xref: /xnu-10063.121.3/bsd/vfs/vfs_syscalls.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117 
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120 
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125 
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 
129 #include <libkern/OSAtomic.h>
130 #include <os/atomic_private.h>
131 #include <pexpert/pexpert.h>
132 #include <IOKit/IOBSD.h>
133 
134 // deps for MIG call
135 #include <kern/host.h>
136 #include <kern/ipc_misc.h>
137 #include <mach/host_priv.h>
138 #include <mach/vfs_nspace.h>
139 #include <os/log.h>
140 
141 #include <nfs/nfs_conf.h>
142 
143 #if ROUTEFS
144 #include <miscfs/routefs/routefs.h>
145 #endif /* ROUTEFS */
146 
147 #if CONFIG_MACF
148 #include <security/mac.h>
149 #include <security/mac_framework.h>
150 #endif
151 
152 #if CONFIG_FSE
153 #define GET_PATH(x) \
154 	((x) = get_pathbuff())
155 #define RELEASE_PATH(x) \
156 	release_pathbuff(x)
157 #else
158 #define GET_PATH(x)     \
159 	((x) = zalloc(ZV_NAMEI))
160 #define RELEASE_PATH(x) \
161 	zfree(ZV_NAMEI, x)
162 #endif /* CONFIG_FSE */
163 
164 #ifndef HFS_GET_BOOT_INFO
165 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
166 #endif
167 
168 #ifndef HFS_SET_BOOT_INFO
169 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
170 #endif
171 
172 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
173 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
174 #endif
175 
176 extern void disk_conditioner_unmount(mount_t mp);
177 
178 /* struct for checkdirs iteration */
179 struct cdirargs {
180 	vnode_t olddp;
181 	vnode_t newdp;
182 };
183 /* callback  for checkdirs iteration */
184 static int checkdirs_callback(proc_t p, void * arg);
185 
186 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
187 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
188 void enablequotas(struct mount *mp, vfs_context_t ctx);
189 static int getfsstat_callback(mount_t mp, void * arg);
190 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
191 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
192 static int sync_callback(mount_t, void *);
193 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
194     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
195     boolean_t partial_copy);
196 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
197 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
198     struct componentname *cnp, user_addr_t fsmountargs,
199     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
200 void vfs_notify_mount(vnode_t pdvp);
201 
202 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
203 
204 struct fd_vn_data * fg_vn_data_alloc(void);
205 
206 /*
207  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
208  * Concurrent lookups (or lookups by ids) on hard links can cause the
209  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
210  * does) to return ENOENT as the path cannot be returned from the name cache
211  * alone. We have no option but to retry and hope to get one namei->reverse path
212  * generation done without an intervening lookup, lookup by id on the hard link
213  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
214  * which currently are the MAC hooks for rename, unlink and rmdir.
215  */
216 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
217 
218 /* Max retry limit for rename due to vnode recycling. */
219 #define MAX_RENAME_ERECYCLE_RETRIES 1024
220 
221 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
222     int unlink_flags);
223 
224 #ifdef CONFIG_IMGSRC_ACCESS
225 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
226 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
227 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
228 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
229 static void mount_end_update(mount_t mp);
230 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
231 #endif /* CONFIG_IMGSRC_ACCESS */
232 
233 //snapshot functions
234 #if CONFIG_MNT_ROOTSNAP
235 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
236 #else
237 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
238 #endif
239 
240 __private_extern__
241 int sync_internal(void);
242 
243 __private_extern__
244 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
245 
246 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
247 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
248 
249 /* vars for sync mutex */
250 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
251 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
252 
253 extern lck_rw_t rootvnode_rw_lock;
254 
255 VFS_SMR_DECLARE;
256 extern uint32_t nc_smr_enabled;
257 
258 /*
259  * incremented each time a mount or unmount operation occurs
260  * used to invalidate the cached value of the rootvp in the
261  * mount structure utilized by cache_lookup_path
262  */
263 uint32_t mount_generation = 0;
264 
265 /* counts number of mount and unmount operations */
266 unsigned int vfs_nummntops = 0;
267 
268 /* system-wide, per-boot unique mount ID */
269 static _Atomic uint64_t mount_unique_id = 1;
270 
271 extern const struct fileops vnops;
272 #if CONFIG_APPLEDOUBLE
273 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
274 #endif /* CONFIG_APPLEDOUBLE */
275 
276 /* Maximum buffer length supported by fsgetpath(2) */
277 #define FSGETPATH_MAXBUFLEN  8192
278 
279 /*
280  * Virtual File System System Calls
281  */
282 
283 /*
284  * Private in-kernel mounting spi (specific use-cases only)
285  */
286 boolean_t
vfs_iskernelmount(mount_t mp)287 vfs_iskernelmount(mount_t mp)
288 {
289 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
290 }
291 
/*
 * In-kernel mount entry point: mount filesystem `fstype` at `path` on
 * behalf of a kernel subsystem.
 *
 * If `vp` is NULLVP, the vnode to be covered (and its parent) are looked
 * up from `path`; otherwise the caller supplies both `pvp` and `vp`.
 * `kern_flags` is sanitized against KERNEL_MOUNT_SANITIZE_MASK and then
 * tagged with KERNEL_MOUNT_KMOUNT before being handed to mount_common().
 * Returns 0 on success or an errno value.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;    /* TRUE when we performed the lookup ourselves */
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip kernel-mount flags that callers may not pass through. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log only for snapshot/role-based mounts, which are expected to resolve. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/* Caller supplied the vnodes; fabricate a componentname from `path`. */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Release the references and lookup state only if we took them above. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
341 
342 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)343 vfs_mount_at_path(const char *fstype, const char *path,
344     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
345     int mnt_flags, int flags)
346 {
347 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
348 	int error, km_flags = 0;
349 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
350 
351 	/*
352 	 * This call is currently restricted to specific use cases.
353 	 */
354 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
355 		return ENOTSUP;
356 	}
357 
358 #if !defined(XNU_TARGET_OS_OSX)
359 	if (strcmp(fstype, "lifs") == 0) {
360 		syscall_flags |= MNT_NOEXEC;
361 	}
362 #endif
363 
364 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
365 		km_flags |= KERNEL_MOUNT_NOAUTH;
366 	}
367 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
368 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
369 	}
370 
371 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
372 	    syscall_flags, km_flags, ctx);
373 	if (error) {
374 		printf("%s: mount on %s failed, error %d\n", __func__, path,
375 		    error);
376 	}
377 
378 	return error;
379 }
380 
381 /*
382  * Mount a file system.
383  */
384 /* ARGSUSED */
385 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)386 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
387 {
388 	struct __mac_mount_args muap;
389 
390 	muap.type = uap->type;
391 	muap.path = uap->path;
392 	muap.flags = uap->flags;
393 	muap.data = uap->data;
394 	muap.mac_p = USER_ADDR_NULL;
395 	return __mac_mount(p, &muap, retval);
396 }
397 
/*
 * fmount(2): mount a filesystem on the directory referenced by the file
 * descriptor uap->fd rather than by path.  Flags that require path-based
 * handling (MNT_IMGSRC_BY_INDEX, MNT_ROOTFS) are refused, as are union
 * mounts.  Funnels into mount_common() with KERNEL_MOUNT_FMOUNT set.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname    cn;
	vfs_context_t           ctx = vfs_context_current();
	size_t                  dummy = 0;
	int                     error;
	int                     flags = uap->flags;
	char                    fstypename[MFSNAMELEN];
	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t                 pvp;    /* parent of the vnode to be covered */
	vnode_t                 vp;     /* vnode to be covered by the mount */

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	/* Copy the filesystem type name in from user space. */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to its vnode; every exit below must file_drop(). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Hold the vnode across the mount; paired with vnode_put() below. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		/*
		 * No parent: EBUSY if something is already mounted here or
		 * this is a filesystem root, EINVAL otherwise.
		 */
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname carrying the covered vnode's full path. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release everything acquired above, in reverse order. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
471 
472 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
473 
474 /*
475  * Get the size of a graft file (a manifest or payload file).
476  * The vp should be an iocounted vnode.
477  */
478 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)479 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
480 {
481 	struct stat64 sb = {};
482 	int error;
483 
484 	*size = 0;
485 
486 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
487 	if (error) {
488 		return error;
489 	}
490 
491 	if (sb.st_size == 0) {
492 		error = ENODATA;
493 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
494 		error = EFBIG;
495 	} else {
496 		*size = (size_t) sb.st_size;
497 	}
498 
499 	return error;
500 }
501 
502 /*
503  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
504  * `size` must already be validated.
505  */
506 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)507 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
508 {
509 	return vn_rdwr(UIO_READ, graft_vp,
510 	           (caddr_t) buf, (int) size, /* offset */ 0,
511 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
512 	           vfs_context_ucred(vctx), /* resid */ NULL,
513 	           vfs_context_proc(vctx));
514 }
515 
516 /*
517  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
518  * and read it into `buf`.
519  */
520 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)521 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
522 {
523 	vnode_t metadata_vp = NULLVP;
524 	int error;
525 
526 	// Convert this graft fd to a vnode.
527 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
528 		goto out;
529 	}
530 
531 	// Get (and validate) size information.
532 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
533 		goto out;
534 	}
535 
536 	// Read each file into the provided buffer - we must get the expected amount of bytes.
537 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
538 		goto out;
539 	}
540 
541 out:
542 	if (metadata_vp) {
543 		vnode_put(metadata_vp);
544 		metadata_vp = NULLVP;
545 	}
546 
547 	return error;
548 }
549 
550 /*
551  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
552  * provided in `gfs`, saving the size of data read in `gfs`.
553  */
554 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)555 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
556     fsioc_graft_fs_t *gfs)
557 {
558 	int error;
559 
560 	// Read the authentic manifest.
561 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
562 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
563 		return error;
564 	}
565 
566 	// The user manifest is currently unused, but set its size.
567 	gfs->user_manifest_size = 0;
568 
569 	// Read the payload.
570 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
571 	    &gfs->payload_size, gfs->payload))) {
572 		return error;
573 	}
574 
575 	return 0;
576 }
577 
578 /*
579  * Call into the filesystem to verify and graft a cryptex.
580  */
/*
 * Validate `sbc_args` and ask the filesystem owning `cryptex_vp` to graft
 * the cryptex via FSIOC_GRAFT_FS.  When `mounton_vp` is supplied it must
 * be a directory on the same volume as `cryptex_vp`; its inode number is
 * passed to the filesystem as the graft directory.  Returns 0 on success
 * or an errno value.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	// Translate the caller-visible SBC_* flags into the FSCTL_GRAFT_*
	// flags understood by the filesystem.
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
669 
670 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
671 
672 /*
673  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
674  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
675  */
/*
 * graftdmg(2) implementation: graft a cryptex disk image (referenced by
 * fd) onto a mount point.  Requires the com.apple.private.vfs.graftdmg
 * entitlement.  Returns 0 on success or an errno value.
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;    /* the disk image being grafted */
	vnode_t mounton_vp = NULLVP;    /* optional graft target directory */
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Only entitled callers may graft. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			/* namei() cleans up after itself on failure. */
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Graft type 0 is invalid; reject anything above the known maximum. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Drop whichever references we acquired above. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	/* nameidone() is owed only when the mount-dir lookup ran successfully. */
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
743 
744 /*
745  * Ungraft a cryptex disk image (via mount dir FD)
746  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
747  */
748 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)749 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
750 {
751 	int error = 0;
752 	user_addr_t ua_mountdir = uap->mountdir;
753 	fsioc_ungraft_fs_t ugfs;
754 	vnode_t mounton_vp = NULLVP;
755 	struct nameidata nd = {};
756 	vfs_context_t ctx = vfs_context_current();
757 
758 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
759 		return EPERM;
760 	}
761 
762 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
763 		return EINVAL;
764 	}
765 
766 	ugfs.ungraft_flags = 0;
767 
768 	// Acquire vnode for mount-on path
769 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
770 	    UIO_USERSPACE, ua_mountdir, ctx);
771 
772 	error = namei(&nd);
773 	if (error) {
774 		return error;
775 	}
776 	mounton_vp = nd.ni_vp;
777 
778 	// Call into the FS to perform the ungraft
779 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
780 
781 	vnode_put(mounton_vp);
782 	nameidone(&nd);
783 
784 	return error;
785 }
786 
787 
788 void
vfs_notify_mount(vnode_t pdvp)789 vfs_notify_mount(vnode_t pdvp)
790 {
791 	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
792 	lock_vnode_and_post(pdvp, NOTE_WRITE);
793 }
794 
795 /*
796  * __mac_mount:
797  *	Mount a file system taking into account MAC label behavior.
798  *	See mount(2) man page for more information
799  *
800  * Parameters:    p                        Process requesting the mount
801  *                uap                      User argument descriptor (see below)
802  *                retval                   (ignored)
803  *
804  * Indirect:      uap->type                Filesystem type
805  *                uap->path                Path to mount
806  *                uap->data                Mount arguments
807  *                uap->mac_p               MAC info
808  *                uap->flags               Mount flags
809  *
810  *
811  * Returns:        0                       Success
812  *                !0                       Not success
813  */
/*
 * Set TRUE when something attempts to remount the root filesystem
 * read/write; consulted by the CHECK_CS_VALIDATION_BITMAP logic below.
 */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;        /* parent of the vnode to be covered */
	vnode_t vp = NULL;         /* vnode to be covered */
	int need_nameidone = 0;    /* nonzero once namei() succeeds; nameidone() owed */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;     /* copied-in MAC label string, if any */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The user_mac structure layout differs for 32- and 64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label length before allocating for it. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are not compiled in; refuse them outright. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the target is the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() tolerates a NULL pointer, so no guard is needed. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
976 
977 /*
978  * common mount implementation (final stage of mounting)
979  *
980  * Arguments:
 *  fstypename	file system type (ie its vfs name)
982  *  pvp		parent of covered vnode
983  *  vp		covered vnode
984  *  cnp		component name (ie path) of covered vnode
985  *  flags	generic mount flags
986  *  fsmountargs	file system specific data
987  *  labelstr	optional MAC label
988  *  kernelmount	TRUE for mounts initiated from inside the kernel
989  *  ctx		caller's context
990  */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;		/* backing block device vnode (iocount from namei) */
	struct vnode *device_vnode = NULLVP;	/* device vnode actually handed to VFS_MOUNT */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;			/* nonzero once mp came from mount_zone (cleanup hint) */
	boolean_t vfsp_ref = FALSE;		/* we bumped vfsp->vfc_refcount */
	boolean_t is_rwlock_locked = FALSE;	/* mnt_rwlock held exclusive */
	boolean_t did_rele = FALSE;		/* devvp usecount already dropped on error path */
	boolean_t have_usecount = FALSE;	/* vnode_ref(vp) succeeded */
	boolean_t did_set_lmount = FALSE;	/* MNT_LMOUNT set in mnt_lflag; must be cleared on exit */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan population count: each pass clears the lowest set bit */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Updates are only permitted on a filesystem's root vnode */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Save pre-update flags so a failed update can restore them (see out1) */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/*
	 * Look up the requested filesystem type; take a reference on the
	 * vfstable entry so it cannot be unregistered while we mount.
	 */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	/* Flush the covered vnode, authorize the caller, and set VMOUNT on it */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* Mark mount-in-progress; cleared on all exit paths via did_set_lmount */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* Prefer the resolved path; fall back to the caller-supplied pathname */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* Both fresh mounts and updates converge here with mnt_rwlock held */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* Clear then re-apply the caller-controllable flag set */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* path string lives in the kernel for basesystem mounts */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(devvp);
				goto out2;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem.  We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* For mount-by-role, fsmountargs carries the origin mount_t, not user data */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* Normal case: hand the (possibly opened) device and args to the FS */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 *
			 *   Need to do this before calling the MAC hook as it needs
			 *   information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* Publish the mount on the covered vnode */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	/* Mount reached VFS_MOUNT successfully; must be torn down with VFS_UNMOUNT */
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	/* Drop the device usecount taken on first mount, unless out4 already did */
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
		vfs_clearmounting(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1878 
1879 /*
1880  * Flush in-core data, check for competing mount attempts,
1881  * and set VMOUNT
1882  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH: caller vouches for authorization (kernel-initiated) */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush in-core data for the covered vnode before it is shadowed */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can be covered by a mount */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * Detect competing mounts:
	 * - fmount: busy if a mount is in progress (VMOUNT) OR something is
	 *   already mounted here;
	 * - otherwise: busy only if a mount is in progress AND something is
	 *   already mounted here.
	 */
	vnode_lock_spin(vp);
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Claim the vnode as a mount point in progress */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC denied the mount; undo the VMOUNT claim */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1948 
#if CONFIG_IMGSRC_ACCESS

/* Set to 1 to enable printf-based imageboot-source relocation diagnostics. */
#define DEBUG_IMGSRC 0

#if DEBUG_IMGSRC
#define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
#else
#define IMGSRC_DEBUG(args...) do { } while(0)
#endif
1958 
/*
 * Validate a user-supplied device path for an imageboot-source relocation:
 * the path must name a block device whose dev_t matches the device already
 * backing 'mp', and a non-root caller must have read (and, for a writable
 * mount, write) access to it.  On success, mp's f_mntfromname is updated
 * to the supplied path and *devvpp returns the device vnode holding an
 * iocount the caller must vnode_put().
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* Kernel-context callers pass a kernel-space path string. */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must resolve to the very device backing the mount. */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: hand the iocount namei() took on vp over to the caller. */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, drop the iocount namei() gave us on vp. */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
2036 
2037 /*
2038  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2039  * and call checkdirs()
2040  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the mount-in-progress marker for the covered state. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		/*
		 * NOTE(review): only mnt_vnodecovered is reset here; the
		 * caller's !placed cleanup clears VMOUNT but appears to
		 * leave v_mountedhere/VMOUNTEDHERE set — confirm intended.
		 */
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2087 
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode and detach the mount from it.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2099 
2100 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2101 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2102 {
2103 	int error;
2104 
2105 	/* unmount in progress return error */
2106 	mount_lock_spin(mp);
2107 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2108 		mount_unlock(mp);
2109 		return EBUSY;
2110 	}
2111 	mount_unlock(mp);
2112 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2113 
2114 	/*
2115 	 * We only allow the filesystem to be reloaded if it
2116 	 * is currently mounted read-only.
2117 	 */
2118 	if ((flags & MNT_RELOAD) &&
2119 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2120 		error = ENOTSUP;
2121 		goto out;
2122 	}
2123 
2124 	/*
2125 	 * Only root, or the user that did the original mount is
2126 	 * permitted to update it.
2127 	 */
2128 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2129 	    (!vfs_context_issuser(ctx))) {
2130 		error = EPERM;
2131 		goto out;
2132 	}
2133 #if CONFIG_MACF
2134 	error = mac_mount_check_remount(ctx, mp);
2135 	if (error != 0) {
2136 		goto out;
2137 	}
2138 #endif
2139 
2140 out:
2141 	if (error) {
2142 		lck_rw_done(&mp->mnt_rwlock);
2143 	}
2144 
2145 	return error;
2146 }
2147 
/* Release the mount rwlock taken by a successful mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2153 
2154 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2155 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2156 {
2157 	vnode_t vp;
2158 
2159 	if (height >= MAX_IMAGEBOOT_NESTING) {
2160 		return EINVAL;
2161 	}
2162 
2163 	vp = imgsrc_rootvnodes[height];
2164 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2165 		*rvpp = vp;
2166 		return 0;
2167 	} else {
2168 		return ENOENT;
2169 	}
2170 }
2171 
/*
 * Relocate an imageboot source filesystem (one of imgsrc_rootvnodes[])
 * so that it becomes covered by 'vp': copy in and validate the caller's
 * arguments, take the mount rwlock, place the mount on 'vp', update
 * f_mntonname, and finally add the mount to the mount list.  Root-only;
 * each source can be moved at most once (MNTK_HAS_MOVED).
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently accepted for this operation. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp on success; dropped on every exit path. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer used to restore f_mntonname if the final step fails. */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed devvp for validation; drop its iocount now. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so out3 can restore it. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Undo the rename and the moved flag set just above. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2392 
2393 #endif /* CONFIG_IMGSRC_ACCESS */
2394 
/*
 * Best-effort: turn on disk quotas for 'mp' (HFS only) for each quota
 * type whose quota-option file exists at the root of the mount.  All
 * errors are ignored so they cannot interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Presence of "<mnton>/<opsname>.<ext>" opts this quota type in. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* The quota data itself lives in "<mnton>/<qfname>.<ext>". */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2428 
2429 
/*
 * Per-process callback for checkdirs(): if this process's current or root
 * directory is 'olddp' (the vnode just covered by a new mount), repoint
 * it at 'newdp' (the root of the new mount), fixing up vnode refs.
 * Always returns PROC_RETURNED so the iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as newdp and are NULLed once that ref is consumed. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2509 
2510 
2511 
2512 /*
2513  * Scan all active processes to see if any of them have a current
2514  * or root directory onto which the new filesystem has just been
2515  * mounted. If so, replace them with the new mount point.
2516  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* A usecount of 1 means only our caller references olddp: nothing to fix. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Root of the filesystem just mounted over olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the covered vnode was the system root vnode, move the root too. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2554 
/*
 * Processes holding this entitlement may unmount volumes mounted by other
 * users/processes without the usual ownership check (see safedounmount()).
 */
#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
	"com.apple.private.vfs.role-account-unmount"
2557 
2558 /*
2559  * Unmount a file system.
2560  *
2561  * Note: unmount takes a path to the vnode mounted on as argument,
2562  * not special file (as before).
2563  */
2564 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Keep the mount alive across vnode_put(); ref consumed below. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2611 
2612 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2613 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2614 {
2615 	mount_t mp;
2616 
2617 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2618 	if (mp == (mount_t)0) {
2619 		return ENOENT;
2620 	}
2621 	mount_ref(mp, 0);
2622 	mount_iterdrop(mp);
2623 	/* safedounmount consumes the mount ref */
2624 	return safedounmount(mp, flags, ctx);
2625 }
2626 
2627 /*
2628  * The mount struct comes with a mount ref which will be consumed.
2629  * Do the actual file system unmount, prevent some common foot shooting.
2630  */
/*
 * Validate and authorize an unmount request, then hand off to dounmount().
 * The caller's mount ref is consumed on every path (here on failure, by
 * dounmount() otherwise).
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over (and ultimately drops) the mount ref. */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Failure before dounmount(): drop the ref the caller handed us. */
	mount_drop(mp, 0);
	return error;
}
2694 
2695 /*
2696  * Do the actual file system unmount.
2697  */
/*
 * Core unmount engine: flush and reclaim the mount's vnodes, call
 * VFS_UNMOUNT, detach the mount from its covered vnode and the mount
 * list, and tear the mount structure down.  'withref' indicates the
 * caller passed a mount ref to be dropped here.  On error the unmount
 * state flags are backed out and the mount remains usable.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep a NOBLOCK unmount from hanging on unresponsive remote FSes. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: back out the unmount-in-progress state. */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain and this is not forced: back out. */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused the unmount: restore iteration and flags. */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock around list removal, then reacquire for teardown. */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no covered vnode; destroy the mount here. */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2985 
2986 /*
2987  * Unmount any mounts in this filesystem.
2988  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: we hold the mount list lock; on failure just skip submounts. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* Covered by a known (sub)mount: it is a submount too. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3046 
/*
 * Drop one mnt_crossref reference on mp; dp is the vnode the mount covers.
 * When the count reaches zero and mp is no longer mounted on dp, the mount
 * structure itself is destroyed and freed.  If need_put is set, an iocount
 * on dp is released as well.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	/* Going negative means an unbalanced drop somewhere. */
	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref and mp no longer mounted on dp: tear the mount down. */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/*
		 * NOTE(review): vfs_smr_synchronize() presumably drains SMR
		 * readers before the mount memory is freed — confirm.
		 */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3080 
3081 
3082 /*
3083  * Sync each mounted filesystem.
3084  */
#if DIAGNOSTIC
int syncprt = 0;	/* non-zero: sync paths dump buffer stats via vfs_bufstats() */
#endif

int print_vmpage_stat = 0;	/* non-zero: sync paths report dirty pages via vm_countdirtypages() */
3090 
3091 /*
3092  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3093  *			mounted read-write with the passed waitfor value.
3094  *
3095  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3096  *		arg	user argument (please see below)
3097  *
3098  * User argument is a pointer to 32 bit unsigned integer which describes the
3099  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3100  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3101  * waitfor value.
3102  *
3103  * Returns:		VFS_RETURNED
3104  */
3105 static int
sync_callback(mount_t mp,void * arg)3106 sync_callback(mount_t mp, void *arg)
3107 {
3108 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3109 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3110 		unsigned waitfor = MNT_NOWAIT;
3111 
3112 		if (arg) {
3113 			waitfor = *(uint32_t*)arg;
3114 		}
3115 
3116 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3117 		if (waitfor != MNT_WAIT &&
3118 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3119 		    waitfor != MNT_NOWAIT &&
3120 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3121 		    waitfor != MNT_DWAIT &&
3122 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3123 			panic("Passed inappropriate waitfor %u to "
3124 			    "sync_callback()", waitfor);
3125 		}
3126 
3127 		mp->mnt_flag &= ~MNT_ASYNC;
3128 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3129 		if (asyncflag) {
3130 			mp->mnt_flag |= MNT_ASYNC;
3131 		}
3132 	}
3133 
3134 	return VFS_RETURNED;
3135 }
3136 
3137 /* ARGSUSED */
3138 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3139 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3140 {
3141 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3142 
3143 	if (print_vmpage_stat) {
3144 		vm_countdirtypages();
3145 	}
3146 
3147 #if DIAGNOSTIC
3148 	if (syncprt) {
3149 		vfs_bufstats();
3150 	}
3151 #endif /* DIAGNOSTIC */
3152 	return 0;
3153 }
3154 
/* Media classes a sync pass can be restricted to (see sync_internal_callback). */
typedef enum {
	SYNC_ALL = 0,                    /* no restriction */
	SYNC_ONLY_RELIABLE_MEDIA = 1,    /* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2   /* virtual or non-local devices only */
} sync_type_t;
3160 
3161 static int
sync_internal_callback(mount_t mp,void * arg)3162 sync_internal_callback(mount_t mp, void *arg)
3163 {
3164 	if (arg) {
3165 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3166 		    (mp->mnt_flag & MNT_LOCAL);
3167 		sync_type_t sync_type = *((sync_type_t *)arg);
3168 
3169 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3170 			return VFS_RETURNED;
3171 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3172 			return VFS_RETURNED;
3173 		}
3174 	}
3175 
3176 	(void)sync_callback(mp, NULL);
3177 
3178 	return VFS_RETURNED;
3179 }
3180 
/*
 * Manipulated under sync_mtx_lck: SYNC_THREAD_RUN requests another sync
 * pass, SYNC_THREAD_RUNNING means a sync_thread() instance is alive.
 */
int sync_thread_state = 0;
int sync_timeout_seconds = 5;	/* how long sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN       0x0001
#define SYNC_THREAD_RUNNING   0x0002

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;	/* identity of the running sync thread (set in sync_thread()) */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3190 
/*
 * Body of the kernel sync thread started by sync_internal().
 *
 * Loops while SYNC_THREAD_RUN is set, doing two vfs_iterate() passes per
 * round: reliable media first, then unreliable media.  On exit it wakes
 * any waiters in sync_internal() and clears SYNC_THREAD_RUNNING.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; a new one may arrive while we sync. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3234 
/* Last time sync_internal() logged a timeout; rate-limits the message (120s). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3236 
3237 /*
3238  * An in-kernel sync for power management to call.
3239  * This function always returns within sync_timeout seconds.
3240  */
3241 __private_extern__ int
sync_internal(void)3242 sync_internal(void)
3243 {
3244 	thread_t thd = NULL;
3245 	int error;
3246 	int thread_created = FALSE;
3247 	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
3248 
3249 	lck_mtx_lock(&sync_mtx_lck);
3250 	sync_thread_state |= SYNC_THREAD_RUN;
3251 	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
3252 		int kr;
3253 
3254 		sync_thread_state |= SYNC_THREAD_RUNNING;
3255 		kr = kernel_thread_start(sync_thread, NULL, &thd);
3256 		if (kr != KERN_SUCCESS) {
3257 			sync_thread_state &= ~SYNC_THREAD_RUNNING;
3258 			lck_mtx_unlock(&sync_mtx_lck);
3259 			printf("sync_thread failed\n");
3260 			return 0;
3261 		}
3262 		thread_created = TRUE;
3263 	}
3264 
3265 	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
3266 	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
3267 	if (error) {
3268 		struct timeval now;
3269 
3270 		microtime(&now);
3271 		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
3272 			printf("sync timed out: %d sec\n", sync_timeout_seconds);
3273 			sync_timeout_last_print.tv_sec = now.tv_sec;
3274 		}
3275 	}
3276 
3277 	if (thread_created) {
3278 		thread_deallocate(thd);
3279 	}
3280 
3281 	return 0;
3282 } /* end of sync_internal call */
3283 
3284 /*
3285  * Change filesystem quotas.
3286  */
3287 #if QUOTA
3288 int
quotactl(proc_t p,struct quotactl_args * uap,__unused int32_t * retval)3289 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
3290 {
3291 	struct mount *mp;
3292 	int error, quota_cmd, quota_status = 0;
3293 	caddr_t datap;
3294 	size_t fnamelen;
3295 	struct nameidata nd;
3296 	vfs_context_t ctx = vfs_context_current();
3297 	struct dqblk my_dqblk = {};
3298 
3299 	AUDIT_ARG(uid, uap->uid);
3300 	AUDIT_ARG(cmd, uap->cmd);
3301 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3302 	    uap->path, ctx);
3303 	error = namei(&nd);
3304 	if (error) {
3305 		return error;
3306 	}
3307 	mp = nd.ni_vp->v_mount;
3308 	mount_ref(mp, 0);
3309 	vnode_put(nd.ni_vp);
3310 	nameidone(&nd);
3311 
3312 #if CONFIG_MACF
3313 	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
3314 	if (error != 0) {
3315 		goto out;
3316 	}
3317 #endif
3318 
3319 	/* copyin any data we will need for downstream code */
3320 	quota_cmd = uap->cmd >> SUBCMDSHIFT;
3321 
3322 	switch (quota_cmd) {
3323 	case Q_QUOTAON:
3324 		/* uap->arg specifies a file from which to take the quotas */
3325 		fnamelen = MAXPATHLEN;
3326 		datap = zalloc(ZV_NAMEI);
3327 		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
3328 		break;
3329 	case Q_GETQUOTA:
3330 		/* uap->arg is a pointer to a dqblk structure. */
3331 		datap = (caddr_t) &my_dqblk;
3332 		break;
3333 	case Q_SETQUOTA:
3334 	case Q_SETUSE:
3335 		/* uap->arg is a pointer to a dqblk structure. */
3336 		datap = (caddr_t) &my_dqblk;
3337 		if (proc_is64bit(p)) {
3338 			struct user_dqblk       my_dqblk64;
3339 			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
3340 			if (error == 0) {
3341 				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
3342 			}
3343 		} else {
3344 			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
3345 		}
3346 		break;
3347 	case Q_QUOTASTAT:
3348 		/* uap->arg is a pointer to an integer */
3349 		datap = (caddr_t) &quota_status;
3350 		break;
3351 	default:
3352 		datap = NULL;
3353 		break;
3354 	} /* switch */
3355 
3356 	if (error == 0) {
3357 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
3358 	}
3359 
3360 	switch (quota_cmd) {
3361 	case Q_QUOTAON:
3362 		if (datap != NULL) {
3363 			zfree(ZV_NAMEI, datap);
3364 		}
3365 		break;
3366 	case Q_GETQUOTA:
3367 		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
3368 		if (error == 0) {
3369 			if (proc_is64bit(p)) {
3370 				struct user_dqblk       my_dqblk64;
3371 
3372 				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
3373 				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
3374 				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
3375 			} else {
3376 				error = copyout(datap, uap->arg, sizeof(struct dqblk));
3377 			}
3378 		}
3379 		break;
3380 	case Q_QUOTASTAT:
3381 		/* uap->arg is a pointer to an integer */
3382 		if (error == 0) {
3383 			error = copyout(datap, uap->arg, sizeof(quota_status));
3384 		}
3385 		break;
3386 	default:
3387 		break;
3388 	} /* switch */
3389 
3390 out:
3391 	mount_drop(mp, 0);
3392 	return error;
3393 }
3394 #else
3395 int
quotactl(__unused proc_t p,__unused struct quotactl_args * uap,__unused int32_t * retval)3396 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
3397 {
3398 	return EOPNOTSUPP;
3399 }
3400 #endif /* QUOTA */
3401 
3402 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3403 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3404 {
3405 	int error;
3406 	vfs_context_t ctx = vfs_context_current();
3407 
3408 #if CONFIG_MACF
3409 	error = mac_mount_check_stat(ctx, mp);
3410 	if (error != 0) {
3411 		return error;
3412 	}
3413 #endif
3414 
3415 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3416 	if (error != 0) {
3417 		return error;
3418 	}
3419 
3420 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3421 }
3422 
3423 /*
3424  * Get filesystem statistics.
3425  *
3426  * Returns:	0			Success
3427  *	namei:???
3428  *	vfs_update_vfsstat:???
3429  *	munge_statfs:EFAULT
3430  */
3431 /* ARGSUSED */
3432 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3433 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3434 {
3435 	int error;
3436 	struct mount *mp;
3437 	struct nameidata nd;
3438 	vfs_context_t ctx = vfs_context_current();
3439 	vnode_t vp;
3440 
3441 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3442 	    UIO_USERSPACE, uap->path, ctx);
3443 	error = namei(&nd);
3444 	if (error != 0) {
3445 		return error;
3446 	}
3447 	vp = nd.ni_vp;
3448 	mp = vp->v_mount;
3449 	nameidone(&nd);
3450 
3451 	error = statfs_internal(p, mp, uap->buf);
3452 	vnode_put(vp);
3453 
3454 	return error;
3455 }
3456 
3457 /*
3458  * Get filesystem statistics.
3459  */
3460 /* ARGSUSED */
3461 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3462 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3463 {
3464 	int error;
3465 	vnode_t vp = NULL;
3466 	struct mount *mp;
3467 
3468 	AUDIT_ARG(fd, uap->fd);
3469 
3470 	if ((error = file_vnode(uap->fd, &vp)) ||
3471 	    (error = vnode_getwithref(vp))) {
3472 		goto out;
3473 	}
3474 
3475 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3476 
3477 	mp = vp->v_mount;
3478 	if (!mp) {
3479 		error = EBADF;
3480 		goto out_vnode;
3481 	}
3482 
3483 	error = statfs_internal(p, mp, uap->buf);
3484 
3485 out_vnode:
3486 	vnode_put(vp);
3487 
3488 out:
3489 	if (vp != NULL) {
3490 		file_drop(uap->fd);
3491 	}
3492 
3493 	return error;
3494 }
3495 
3496 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3497 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3498 {
3499 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3500 
3501 	bzero(sfs, sizeof(*sfs));
3502 
3503 	sfs->f_bsize = vsfs->f_bsize;
3504 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3505 	sfs->f_blocks = vsfs->f_blocks;
3506 	sfs->f_bfree = vsfs->f_bfree;
3507 	sfs->f_bavail = vsfs->f_bavail;
3508 	sfs->f_files = vsfs->f_files;
3509 	sfs->f_ffree = vsfs->f_ffree;
3510 	sfs->f_fsid = vsfs->f_fsid;
3511 	sfs->f_owner = vsfs->f_owner;
3512 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3513 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3514 	sfs->f_fssubtype = vsfs->f_fssubtype;
3515 	sfs->f_flags_ext = 0;
3516 	if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3517 		sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3518 	}
3519 	if (mp->mnt_kern_flag & MNTK_FSKIT) {
3520 		sfs->f_flags_ext |= MNT_EXT_FSKIT;
3521 	}
3522 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3523 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3524 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3525 }
3526 
3527 /*
3528  * Get file system statistics in 64-bit mode
3529  */
3530 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3531 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3532 {
3533 	struct mount *mp;
3534 	int error;
3535 	struct nameidata *ndp;
3536 	struct statfs64 *sfsp;
3537 	vfs_context_t ctxp = vfs_context_current();
3538 	vnode_t vp;
3539 	struct {
3540 		struct nameidata nd;
3541 		struct statfs64 sfs;
3542 	} *__nameidata_statfs64;
3543 
3544 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3545 	    Z_WAITOK);
3546 	ndp = &__nameidata_statfs64->nd;
3547 
3548 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3549 	    UIO_USERSPACE, uap->path, ctxp);
3550 	error = namei(ndp);
3551 	if (error != 0) {
3552 		goto out;
3553 	}
3554 	vp = ndp->ni_vp;
3555 	mp = vp->v_mount;
3556 	nameidone(ndp);
3557 
3558 #if CONFIG_MACF
3559 	error = mac_mount_check_stat(ctxp, mp);
3560 	if (error != 0) {
3561 		vnode_put(vp);
3562 		goto out;
3563 	}
3564 #endif
3565 
3566 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3567 	if (error != 0) {
3568 		vnode_put(vp);
3569 		goto out;
3570 	}
3571 
3572 	sfsp = &__nameidata_statfs64->sfs;
3573 	vfs_get_statfs64(mp, sfsp);
3574 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3575 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3576 		/* This process does not want to see a seperate data volume mountpoint */
3577 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3578 	}
3579 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3580 	vnode_put(vp);
3581 
3582 out:
3583 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3584 
3585 	return error;
3586 }
3587 
3588 /*
3589  * Get file system statistics in 64-bit mode
3590  */
3591 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3592 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3593 {
3594 	struct vnode *vp;
3595 	struct mount *mp;
3596 	struct statfs64 sfs;
3597 	int error;
3598 
3599 	AUDIT_ARG(fd, uap->fd);
3600 
3601 	if ((error = file_vnode(uap->fd, &vp))) {
3602 		return error;
3603 	}
3604 
3605 	error = vnode_getwithref(vp);
3606 	if (error) {
3607 		file_drop(uap->fd);
3608 		return error;
3609 	}
3610 
3611 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3612 
3613 	mp = vp->v_mount;
3614 	if (!mp) {
3615 		error = EBADF;
3616 		goto out;
3617 	}
3618 
3619 #if CONFIG_MACF
3620 	error = mac_mount_check_stat(vfs_context_current(), mp);
3621 	if (error != 0) {
3622 		goto out;
3623 	}
3624 #endif
3625 
3626 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3627 		goto out;
3628 	}
3629 
3630 	vfs_get_statfs64(mp, &sfs);
3631 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3632 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3633 		/* This process does not want to see a seperate data volume mountpoint */
3634 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3635 	}
3636 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3637 
3638 out:
3639 	file_drop(uap->fd);
3640 	vnode_put(vp);
3641 
3642 	return error;
3643 }
3644 
/* Shared iteration state for the getfsstat*_callback() functions. */
struct getfsstat_struct {
	user_addr_t     sfsp;		/* user buffer cursor; advanced per record copied out */
	user_addr_t     *mp;		/* optional array of user addresses receiving MAC labels (NULL if unused) */
	int             count;		/* mounts visited (also counts ones that didn't fit) */
	int             maxcount;	/* capacity of the user buffer, in records */
	int             flags;		/* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT request */
	int             error;		/* first error encountered, if any */
};
3653 
3654 
/*
 * vfs_iterate() callback for __mac_getfsstat(): copy one (32/64-bit
 * munged) statfs record out to the user buffer tracked in arg, plus an
 * optional MAC label.  Stops the iteration (VFS_RETURNED_DONE) on error.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is room left in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip this mount, keep going. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* my_size is the record size munge_statfs() copied out. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even when not copied, so the caller can detect truncation. */
	fstp->count++;
	return VFS_RETURNED;
}
3708 
3709 /*
3710  * Get statistics on all filesystems.
3711  */
3712 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3713 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3714 {
3715 	struct __mac_getfsstat_args muap;
3716 
3717 	muap.buf = uap->buf;
3718 	muap.bufsize = uap->bufsize;
3719 	muap.mac = USER_ADDR_NULL;
3720 	muap.macsize = 0;
3721 	muap.flags = uap->flags;
3722 
3723 	return __mac_getfsstat(p, &muap, retval);
3724 }
3725 
3726 /*
3727  * __mac_getfsstat: Get MAC-related file system statistics
3728  *
3729  * Parameters:    p                        (ignored)
3730  *                uap                      User argument descriptor (see below)
3731  *                retval                   Count of file system statistics (N stats)
3732  *
3733  * Indirect:      uap->bufsize             Buffer size
3734  *                uap->macsize             MAC info size
3735  *                uap->buf                 Buffer where information will be returned
3736  *                uap->mac                 MAC info
3737  *                uap->flags               File system flags
3738  *
3739  *
3740  * Returns:        0                       Success
3741  *                !0                       Not success
3742  *
3743  */
3744 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3745 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3746 {
3747 	user_addr_t sfsp;
3748 	user_addr_t *mp;
3749 	size_t count, maxcount, bufsize, macsize;
3750 	struct getfsstat_struct fst;
3751 
3752 	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3753 		return EINVAL;
3754 	}
3755 
3756 	bufsize = (size_t) uap->bufsize;
3757 	macsize = (size_t) uap->macsize;
3758 
3759 	if (IS_64BIT_PROCESS(p)) {
3760 		maxcount = bufsize / sizeof(struct user64_statfs);
3761 	} else {
3762 		maxcount = bufsize / sizeof(struct user32_statfs);
3763 	}
3764 	sfsp = uap->buf;
3765 	count = 0;
3766 
3767 	mp = NULL;
3768 
3769 #if CONFIG_MACF
3770 	if (uap->mac != USER_ADDR_NULL) {
3771 		u_int32_t *mp0;
3772 		int error;
3773 		unsigned int i;
3774 
3775 		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3776 		if (count != maxcount) {
3777 			return EINVAL;
3778 		}
3779 
3780 		/* Copy in the array */
3781 		mp0 = kalloc_data(macsize, Z_WAITOK);
3782 		if (mp0 == NULL) {
3783 			return ENOMEM;
3784 		}
3785 
3786 		error = copyin(uap->mac, mp0, macsize);
3787 		if (error) {
3788 			kfree_data(mp0, macsize);
3789 			return error;
3790 		}
3791 
3792 		/* Normalize to an array of user_addr_t */
3793 		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3794 		if (mp == NULL) {
3795 			kfree_data(mp0, macsize);
3796 			return ENOMEM;
3797 		}
3798 
3799 		for (i = 0; i < count; i++) {
3800 			if (IS_64BIT_PROCESS(p)) {
3801 				mp[i] = ((user_addr_t *)mp0)[i];
3802 			} else {
3803 				mp[i] = (user_addr_t)mp0[i];
3804 			}
3805 		}
3806 		kfree_data(mp0, macsize);
3807 	}
3808 #endif
3809 
3810 
3811 	fst.sfsp = sfsp;
3812 	fst.mp = mp;
3813 	fst.flags = uap->flags;
3814 	fst.count = 0;
3815 	fst.error = 0;
3816 	fst.maxcount = (int)maxcount;
3817 
3818 
3819 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3820 
3821 	if (mp) {
3822 		kfree_data(mp, count * sizeof(user_addr_t));
3823 	}
3824 
3825 	if (fst.error) {
3826 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3827 		return fst.error;
3828 	}
3829 
3830 	if (fst.sfsp && fst.count > fst.maxcount) {
3831 		*retval = fst.maxcount;
3832 	} else {
3833 		*retval = fst.count;
3834 	}
3835 	return 0;
3836 }
3837 
3838 static int
getfsstat64_callback(mount_t mp,void * arg)3839 getfsstat64_callback(mount_t mp, void * arg)
3840 {
3841 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3842 	struct vfsstatfs *sp;
3843 	struct statfs64 sfs;
3844 	int error;
3845 
3846 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
3847 #if CONFIG_MACF
3848 		error = mac_mount_check_stat(vfs_context_current(), mp);
3849 		if (error != 0) {
3850 			fstp->error = error;
3851 			return VFS_RETURNED_DONE;
3852 		}
3853 #endif
3854 		sp = &mp->mnt_vfsstat;
3855 		/*
3856 		 * If MNT_NOWAIT is specified, do not refresh the fsstat
3857 		 * cache. MNT_WAIT overrides MNT_NOWAIT.
3858 		 *
3859 		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3860 		 * getfsstat, since the constants are out of the same
3861 		 * namespace.
3862 		 */
3863 		if ((mp->mnt_lflag & MNT_LDEAD) ||
3864 		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3865 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3866 		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3867 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3868 			return VFS_RETURNED;
3869 		}
3870 
3871 		vfs_get_statfs64(mp, &sfs);
3872 		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3873 		if (error) {
3874 			fstp->error = error;
3875 			return VFS_RETURNED_DONE;
3876 		}
3877 		fstp->sfsp += sizeof(sfs);
3878 	}
3879 	fstp->count++;
3880 	return VFS_RETURNED;
3881 }
3882 
3883 /*
3884  * Get statistics on all file systems in 64 bit mode.
3885  */
3886 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3887 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3888 {
3889 	user_addr_t sfsp;
3890 	int count, maxcount;
3891 	struct getfsstat_struct fst;
3892 
3893 	maxcount = uap->bufsize / sizeof(struct statfs64);
3894 
3895 	sfsp = uap->buf;
3896 	count = 0;
3897 
3898 	fst.sfsp = sfsp;
3899 	fst.flags = uap->flags;
3900 	fst.count = 0;
3901 	fst.error = 0;
3902 	fst.maxcount = maxcount;
3903 
3904 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3905 
3906 	if (fst.error) {
3907 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3908 		return fst.error;
3909 	}
3910 
3911 	if (fst.sfsp && fst.count > fst.maxcount) {
3912 		*retval = fst.maxcount;
3913 	} else {
3914 		*retval = fst.count;
3915 	}
3916 
3917 	return 0;
3918 }
3919 
3920 /*
3921  * gets the associated vnode with the file descriptor passed.
3922  * as input
3923  *
3924  * INPUT
3925  * ctx - vfs context of caller
3926  * fd - file descriptor for which vnode is required.
3927  * vpp - Pointer to pointer to vnode to be returned.
3928  *
3929  * The vnode is returned with an iocount so any vnode obtained
3930  * by this call needs a vnode_put
3931  *
3932  */
3933 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3934 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3935 {
3936 	int error;
3937 	vnode_t vp;
3938 	struct fileproc *fp;
3939 	proc_t p = vfs_context_proc(ctx);
3940 
3941 	*vpp =  NULLVP;
3942 
3943 	error = fp_getfvp(p, fd, &fp, &vp);
3944 	if (error) {
3945 		return error;
3946 	}
3947 
3948 	error = vnode_getwithref(vp);
3949 	if (error) {
3950 		(void)fp_drop(p, fd, fp, 0);
3951 		return error;
3952 	}
3953 
3954 	(void)fp_drop(p, fd, fp, 0);
3955 	*vpp = vp;
3956 	return error;
3957 }
3958 
3959 /*
3960  * Wrapper function around namei to start lookup from a directory
3961  * specified by a file descriptor ni_dirfd.
3962  *
3963  * In addition to all the errors returned by namei, this call can
3964  * return ENOTDIR if the file descriptor does not refer to a directory.
3965  * and EBADF if the file descriptor is not valid.
3966  */
3967 int
nameiat(struct nameidata * ndp,int dirfd)3968 nameiat(struct nameidata *ndp, int dirfd)
3969 {
3970 	if ((dirfd != AT_FDCWD) &&
3971 	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3972 	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
3973 		int error = 0;
3974 		char c;
3975 
3976 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3977 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
3978 			if (error) {
3979 				return error;
3980 			}
3981 		} else {
3982 			c = *((char *)(ndp->ni_dirp));
3983 		}
3984 
3985 		if (c != '/') {
3986 			vnode_t dvp_at;
3987 
3988 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3989 			    &dvp_at);
3990 			if (error) {
3991 				return error;
3992 			}
3993 
3994 			if (vnode_vtype(dvp_at) != VDIR) {
3995 				vnode_put(dvp_at);
3996 				return ENOTDIR;
3997 			}
3998 
3999 			ndp->ni_dvp = dvp_at;
4000 			ndp->ni_cnd.cn_flags |= USEDVP;
4001 			error = namei(ndp);
4002 			ndp->ni_cnd.cn_flags &= ~USEDVP;
4003 			vnode_put(dvp_at);
4004 			return error;
4005 		}
4006 	}
4007 
4008 	return namei(ndp);
4009 }
4010 
4011 /*
4012  * Change current working directory to a given file descriptor.
4013  */
4014 /* ARGSUSED */
4015 int
fchdir(proc_t p,vfs_context_t ctx,int fd,bool per_thread)4016 fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
4017 {
4018 	vnode_t vp;
4019 	vnode_t tdp;
4020 	vnode_t tvp;
4021 	struct mount *mp;
4022 	int error, should_put = 1;
4023 
4024 	AUDIT_ARG(fd, fd);
4025 	if (per_thread && fd == -1) {
4026 		/*
4027 		 * Switching back from per-thread to per process CWD; verify we
4028 		 * in fact have one before proceeding.  The only success case
4029 		 * for this code path is to return 0 preemptively after zapping
4030 		 * the thread structure contents.
4031 		 */
4032 		thread_t th = vfs_context_thread(ctx);
4033 		if (th) {
4034 			uthread_t uth = get_bsdthread_info(th);
4035 			tvp = uth->uu_cdir;
4036 			uth->uu_cdir = NULLVP;
4037 			if (tvp != NULLVP) {
4038 				vnode_rele(tvp);
4039 				return 0;
4040 			}
4041 		}
4042 		return EBADF;
4043 	}
4044 
4045 	if ((error = file_vnode(fd, &vp))) {
4046 		return error;
4047 	}
4048 	if ((error = vnode_getwithref(vp))) {
4049 		file_drop(fd);
4050 		return error;
4051 	}
4052 
4053 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
4054 
4055 	if (vp->v_type != VDIR) {
4056 		error = ENOTDIR;
4057 		goto out;
4058 	}
4059 
4060 #if CONFIG_MACF
4061 	error = mac_vnode_check_chdir(ctx, vp);
4062 	if (error) {
4063 		goto out;
4064 	}
4065 #endif
4066 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4067 	if (error) {
4068 		goto out;
4069 	}
4070 
4071 	while (!error && (mp = vp->v_mountedhere) != NULL) {
4072 		if (vfs_busy(mp, LK_NOWAIT)) {
4073 			error = EACCES;
4074 			goto out;
4075 		}
4076 		error = VFS_ROOT(mp, &tdp, ctx);
4077 		vfs_unbusy(mp);
4078 		if (error) {
4079 			break;
4080 		}
4081 		vnode_put(vp);
4082 		vp = tdp;
4083 	}
4084 	if (error) {
4085 		goto out;
4086 	}
4087 	if ((error = vnode_ref(vp))) {
4088 		goto out;
4089 	}
4090 	vnode_put(vp);
4091 	should_put = 0;
4092 
4093 	if (per_thread) {
4094 		thread_t th = vfs_context_thread(ctx);
4095 		if (th) {
4096 			uthread_t uth = get_bsdthread_info(th);
4097 			tvp = uth->uu_cdir;
4098 			uth->uu_cdir = vp;
4099 			OSBitOrAtomic(P_THCWD, &p->p_flag);
4100 		} else {
4101 			vnode_rele(vp);
4102 			error = ENOENT;
4103 			goto out;
4104 		}
4105 	} else {
4106 		proc_dirs_lock_exclusive(p);
4107 		proc_fdlock(p);
4108 		tvp = p->p_fd.fd_cdir;
4109 		p->p_fd.fd_cdir = vp;
4110 		proc_fdunlock(p);
4111 		proc_dirs_unlock_exclusive(p);
4112 	}
4113 
4114 	if (tvp) {
4115 		vnode_rele(tvp);
4116 	}
4117 
4118 out:
4119 	if (should_put) {
4120 		vnode_put(vp);
4121 	}
4122 	file_drop(fd);
4123 
4124 	return error;
4125 }
4126 
/* fchdir(2): change the per-process working directory to the dir open at fd. */
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return fchdir(p, vfs_context_current(), uap->fd, false);
}
4132 
/* Per-thread variant of fchdir(2); fd == -1 reverts to the process cwd. */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return fchdir(p, vfs_context_current(), uap->fd, true);
}
4138 
4139 
4140 /*
4141  * Change current working directory (".").
4142  *
4143  * Returns:	0			Success
4144  *	change_dir:ENOTDIR
4145  *	change_dir:???
4146  *	vnode_ref:ENOENT		No such file or directory
4147  */
4148 /* ARGSUSED */
4149 int
chdir_internal(proc_t p,vfs_context_t ctx,struct nameidata * ndp,int per_thread)4150 chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
4151 {
4152 	int error;
4153 	vnode_t tvp;
4154 
4155 	error = change_dir(ndp, ctx);
4156 	if (error) {
4157 		return error;
4158 	}
4159 	if ((error = vnode_ref(ndp->ni_vp))) {
4160 		vnode_put(ndp->ni_vp);
4161 		return error;
4162 	}
4163 	/*
4164 	 * drop the iocount we picked up in change_dir
4165 	 */
4166 	vnode_put(ndp->ni_vp);
4167 
4168 	if (per_thread) {
4169 		thread_t th = vfs_context_thread(ctx);
4170 		if (th) {
4171 			uthread_t uth = get_bsdthread_info(th);
4172 			tvp = uth->uu_cdir;
4173 			uth->uu_cdir = ndp->ni_vp;
4174 			OSBitOrAtomic(P_THCWD, &p->p_flag);
4175 		} else {
4176 			vnode_rele(ndp->ni_vp);
4177 			return ENOENT;
4178 		}
4179 	} else {
4180 		proc_dirs_lock_exclusive(p);
4181 		proc_fdlock(p);
4182 		tvp = p->p_fd.fd_cdir;
4183 		p->p_fd.fd_cdir = ndp->ni_vp;
4184 		proc_fdunlock(p);
4185 		proc_dirs_unlock_exclusive(p);
4186 	}
4187 
4188 	if (tvp) {
4189 		vnode_rele(tvp);
4190 	}
4191 
4192 	return 0;
4193 }
4194 
4195 
4196 /*
4197  * Change current working directory (".").
4198  *
4199  * Returns:	0			Success
4200  *	chdir_internal:ENOTDIR
4201  *	chdir_internal:ENOENT		No such file or directory
4202  *	chdir_internal:???
4203  */
4204 /* ARGSUSED */
4205 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4206 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4207 {
4208 	struct nameidata nd;
4209 	vfs_context_t ctx = vfs_context_current();
4210 
4211 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4212 	    UIO_USERSPACE, uap->path, ctx);
4213 
4214 	return chdir_internal(p, ctx, &nd, per_thread);
4215 }
4216 
4217 
4218 /*
4219  * chdir
4220  *
4221  * Change current working directory (".") for the entire process
4222  *
4223  * Parameters:  p       Process requesting the call
4224  *              uap     User argument descriptor (see below)
4225  *              retval  (ignored)
4226  *
4227  * Indirect parameters:	uap->path	Directory path
4228  *
4229  * Returns:	0			Success
4230  *              common_chdir: ENOTDIR
4231  *              common_chdir: ENOENT	No such file or directory
4232  *              common_chdir: ???
4233  *
4234  */
4235 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4236 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4237 {
4238 	return common_chdir(p, (void *)uap, 0);
4239 }
4240 
4241 /*
4242  * __pthread_chdir
4243  *
4244  * Change current working directory (".") for a single thread
4245  *
4246  * Parameters:  p       Process requesting the call
4247  *              uap     User argument descriptor (see below)
4248  *              retval  (ignored)
4249  *
4250  * Indirect parameters:	uap->path	Directory path
4251  *
4252  * Returns:	0			Success
4253  *              common_chdir: ENOTDIR
4254  *		common_chdir: ENOENT	No such file or directory
4255  *		common_chdir: ???
4256  *
4257  */
4258 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4259 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4260 {
4261 	return common_chdir(p, (void *)uap, 1);
4262 }
4263 
4264 
/*
 * Change notion of root (``/'') directory.
 */
/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot(2) requires superuser privileges. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir() returns with an iocount held on nd.ni_vp on success. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Convert the iocount into a long-lived usecount before installing. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root directory, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4326 
4327 #define PATHSTATICBUFLEN 256
4328 #define PIVOT_ROOT_ENTITLEMENT              \
4329        "com.apple.private.vfs.pivot-root"
4330 
4331 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack buffers; spilled to ZV_NAMEI heap buffers if too long. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; fall back to a MAXPATHLEN heap buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the path the old root will appear under. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Pick whichever buffer (stack or heap) actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Drop the iocount taken by vnode_lookup(), if we got that far. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4423 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS targets. */
	return nosys(p, NULL, retval);
}
4429 #endif /* XNU_TARGET_OS_OSX */
4430 
4431 /*
4432  * Common routine for chroot and chdir.
4433  *
4434  * Returns:	0			Success
4435  *		ENOTDIR			Not a directory
4436  *		namei:???		[anything namei can return]
4437  *		vnode_authorize:???	[anything vnode_authorize can return]
4438  */
4439 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4440 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4441 {
4442 	vnode_t vp;
4443 	int error;
4444 
4445 	if ((error = namei(ndp))) {
4446 		return error;
4447 	}
4448 	nameidone(ndp);
4449 	vp = ndp->ni_vp;
4450 
4451 	if (vp->v_type != VDIR) {
4452 		vnode_put(vp);
4453 		return ENOTDIR;
4454 	}
4455 
4456 #if CONFIG_MACF
4457 	error = mac_vnode_check_chdir(ctx, vp);
4458 	if (error) {
4459 		vnode_put(vp);
4460 		return error;
4461 	}
4462 #endif
4463 
4464 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4465 	if (error) {
4466 		vnode_put(vp);
4467 		return error;
4468 	}
4469 
4470 	return error;
4471 }
4472 
/*
 * Allocate the per-fd vnode data (used for directories) that hangs off
 * the file glob.  (The previous comment here said "Free" -- copy/paste
 * error; this is the allocation side, fg_vn_data_free() is the release.)
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4486 
/*
 * Free the vnode data (for directories) associated with the file glob.
 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* fv_buf may be NULL (never filled in); kfree_data handles that. */
	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
	kfree_type(struct fd_vn_data, fvdata);
}
4499 
/*
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * Returns:	0			Success
 *		EINVAL
 *		EINTR
 *	falloc:ENFILE
 *	falloc:EMFILE
 *	falloc:ENOMEM
 *	vn_open_auth:???
 *	dupfdopen:???
 *	VNOP_ADVLOCK:???
 *	vnode_setsize:???
 *
 * XXX Need to implement uid, gid
 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/* Callers may not request the encryption bits; vn_open_auth sets them. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot (indx) and a fileproc for it. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional "authorizing" vnode, supplied as a descriptor by the caller. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd set means /dev/fd (fdesc_open)
		 * asked us to dup an existing descriptor instead.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Acquire an advisory flock-style lock when O_EXLOCK/O_SHLOCK is set. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Policy for whether this file's pages may live in the secluded pool:
	 * writable files and files needed for realtime audio / Camera launch
	 * are excluded; read-only app binaries may be included.
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * Drop the iocount from vn_open_auth.  NOTE(review): vp is still used
	 * below (vnode_istty) -- presumably kept alive by the usecount the
	 * fileglob holds; confirm against vn_open_auth's reference semantics.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: make indx visible and drop our fp reference. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error path: undo the lock (if taken), close the vnode, free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4819 
/*
 * While most of the *at syscall handlers can call nameiat() which
 * is a wrapper around namei, the use of namei and initialisation
 * of nameidata are far removed and in different functions  - namei
 * gets called in vn_open_auth for open1. So we'll just do here what
 * nameiat() does.
 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/* Only relative paths need the dirfd; USEDVP means a dvp was supplied. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* The dirfd must refer to a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Anchor the lookup at dvp_at; iocount held across open1. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: behave exactly like plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4870 
/*
 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			Pointer to an area to receive the
 *					return value from the system call
 *
 * Indirect:	uap->path		Path to open (same as 'open')
 *		uap->flags		Flags to open (same as 'open'
 *		uap->uid		UID to set, if creating
 *		uap->gid		GID to set, if creating
 *		uap->mode		File mode, if creating (same as 'open')
 *		uap->xsecurity		ACL to set, if creating
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:		We should enummerate the possible errno values here, and where
 *		in the code they originated.
 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied filesec (ACL), if any. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask; never allow the sticky bit on create. */
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4936 
4937 /*
4938  * Go through the data-protected atomically controlled open (2)
4939  *
4940  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4941  */
4942 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4943 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4944     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4945 {
4946 	/*
4947 	 * Follow the same path as normal open(2)
4948 	 * Look up the item if it exists, and acquire the vnode.
4949 	 */
4950 	struct vnode_attr va;
4951 	struct nameidata nd;
4952 	int cmode;
4953 	int error;
4954 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4955 
4956 	VATTR_INIT(&va);
4957 	/* Mask off all but regular access permissions */
4958 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4959 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4960 
4961 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4962 	    path, ctx);
4963 
4964 	/*
4965 	 * Initialize the extra fields in vnode_attr to pass down our
4966 	 * extra fields.
4967 	 * 1. target cprotect class.
4968 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4969 	 */
4970 	if (flags & O_CREAT) {
4971 		/* lower level kernel code validates that the class is valid before applying it. */
4972 		if (class != PROTECTION_CLASS_DEFAULT) {
4973 			/*
4974 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4975 			 * file behave the same as open (2)
4976 			 */
4977 			VATTR_SET(&va, va_dataprotect_class, class);
4978 		}
4979 	}
4980 
4981 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4982 		if (flags & (O_RDWR | O_WRONLY)) {
4983 			/*
4984 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
4985 			 */
4986 			return EINVAL;
4987 		}
4988 		if (dpflags & O_DP_GETRAWENCRYPTED) {
4989 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4990 		}
4991 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4992 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4993 		}
4994 		if (dpflags & O_DP_AUTHENTICATE) {
4995 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4996 		}
4997 	}
4998 
4999 	error = open1at(vfs_context_current(), &nd, flags, &va,
5000 	    NULL, NULL, retval, fd, authfd);
5001 
5002 	return error;
5003 }
5004 
int
openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
{
	/* An authenticated open cannot also create the file. */
	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
}
5015 
int
open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
{
	/* O_DP_AUTHENTICATE requires the *at variant (it needs an authfd). */
	if (uap->dpflags & O_DP_AUTHENTICATE) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
}
5026 
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * vnode_attr + nameidata are heap-allocated as one unit --
	 * presumably to keep kernel stack usage down on this hot path.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5059 
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a pthread cancellation point; check before proceeding. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5066 
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Non-cancellable open: plain open relative to the cwd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5074 
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* Non-cancellable openat: lookup anchored at uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5082 
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a pthread cancellation point; check before proceeding. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5089 
5090 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5091 
5092 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5093 vfs_context_can_open_by_id(vfs_context_t ctx)
5094 {
5095 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5096 		return TRUE;
5097 	}
5098 
5099 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5100 	           OPEN_BY_ID_ENTITLEMENT);
5101 }
5102 
/*
 * openbyid_np: open a file given a file system id and a file system object id
 *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
 *	file systems that don't support object ids it is a node id (uint64_t).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			Pointer to an area to receive the
 *					return calue from the system call
 *
 * Indirect:	uap->path		Path to open (same as 'open')
 *
 *		uap->fsid		id of target file system
 *		uap->objid		id of target file system object
 *		uap->flags		Flags to open (same as 'open')
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 *
 * XXX:		We should enummerate the possible errno values here, and where
 *		in the code they originated.
 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries and entitled tasks. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/* Grow the buffer by MAXPATHLEN each time the path doesn't fit. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path before handing it to openat_internal. */
	buf[pathlen] = 0;

	/* Open the resolved path; it lives in kernel space (UIO_SYSSPACE). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5182 
5183 
5184 /*
5185  * Create a special file.
5186  */
5187 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5188     int fd);
5189 
/*
 * mknodat_internal: common implementation for mknod()/mknodat().
 *
 * Creates a special file at 'upath', resolved relative to directory fd
 * 'fd' (or the cwd when fd is AT_FDCWD).  FIFO creation is delegated to
 * mkfifo1(); block and character devices require superuser; any other
 * file type in 'mode' fails with EINVAL.  'vap' supplies the initial
 * attributes (va_mode/va_rdev are set by the callers).
 *
 * Returns 0 on success, or an errno value (EEXIST if the name already
 * exists, EINVAL for unsupported types, plus anything namei/vn_create
 * and the authorization layers can return).
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Device nodes may only be created by the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Translate the mknod type bits into a vnode type. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Caller must be allowed to add entries to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the directory; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5292 
5293 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5294 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5295 {
5296 	struct vnode_attr va;
5297 
5298 	VATTR_INIT(&va);
5299 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5300 	VATTR_SET(&va, va_rdev, uap->dev);
5301 
5302 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5303 }
5304 
5305 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5306 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5307 {
5308 	struct vnode_attr va;
5309 
5310 	VATTR_INIT(&va);
5311 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5312 	VATTR_SET(&va, va_rdev, uap->dev);
5313 
5314 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5315 }
5316 
5317 /*
5318  * Create a named pipe.
5319  *
5320  * Returns:	0			Success
5321  *		EEXIST
5322  *	namei:???
5323  *	vnode_authorize:???
5324  *	vn_create:???
5325  */
/*
 * mkfifo1: shared guts of mkfifo()/mkfifoat()/mkfifo_extended() and the
 * FIFO branch of mknodat_internal().  Creates a FIFO at 'upath' (resolved
 * relative to directory fd 'fd') with the attributes in 'vap'.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	/* On success vp holds an iocount on the new FIFO vnode. */
	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5368 
5369 
5370 /*
5371  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5372  *
5373  * Parameters:	p			Process requesting the open
5374  *		uap			User argument descriptor (see below)
5375  *		retval			(Ignored)
5376  *
5377  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5378  *		uap->uid		UID to set
5379  *		uap->gid		GID to set
5380  *		uap->mode		File mode to set (same as 'mkfifo')
5381  *		uap->xsecurity		ACL to set, if creating
5382  *
5383  * Returns:	0			Success
5384  *		!0			errno value
5385  *
5386  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5387  *
 * XXX:		We should enumerate the possible errno values here, and where
5389  *		in the code they originated.
5390  */
5391 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5392 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5393 {
5394 	int ciferror;
5395 	kauth_filesec_t xsecdst;
5396 	struct vnode_attr va;
5397 
5398 	AUDIT_ARG(owner, uap->uid, uap->gid);
5399 
5400 	xsecdst = KAUTH_FILESEC_NONE;
5401 	if (uap->xsecurity != USER_ADDR_NULL) {
5402 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5403 			return ciferror;
5404 		}
5405 	}
5406 
5407 	VATTR_INIT(&va);
5408 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5409 	if (uap->uid != KAUTH_UID_NONE) {
5410 		VATTR_SET(&va, va_uid, uap->uid);
5411 	}
5412 	if (uap->gid != KAUTH_GID_NONE) {
5413 		VATTR_SET(&va, va_gid, uap->gid);
5414 	}
5415 	if (xsecdst != KAUTH_FILESEC_NONE) {
5416 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5417 		va.va_vaflags |= VA_FILESEC_ACL;
5418 	}
5419 
5420 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5421 
5422 	if (xsecdst != KAUTH_FILESEC_NONE) {
5423 		kauth_filesec_free(xsecdst);
5424 	}
5425 	return ciferror;
5426 }
5427 
5428 /* ARGSUSED */
5429 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5430 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5431 {
5432 	struct vnode_attr va;
5433 
5434 	VATTR_INIT(&va);
5435 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5436 
5437 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5438 }
5439 
5440 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5441 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5442 {
5443 	struct vnode_attr va;
5444 
5445 	VATTR_INIT(&va);
5446 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5447 
5448 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5449 }
5450 
5451 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5452 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5453 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5454 
/*
 * safe_getpath_new: best-effort construction of the path to 'dvp', with
 * 'leafname' (if non-NULL) appended, into 'path' (capacity _len bytes).
 * 'firmlink' selects vn_getpath() (resolves firmlinks) vs
 * vn_getpath_no_firmlink().
 *
 * "Best-effort": if the exact path cannot be obtained or does not fit,
 * *truncated_path is set to 1 and the nearest usable ancestor path (or
 * the mount point, or "/") is returned instead of failing.
 *
 * Returns the length of the resulting string including the NUL
 * terminator, as maintained by the logic below.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fetched, but there is no room to append the leaf. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain until some ancestor's path fits;
		 * fall back to the mount point or "/" when there is no parent.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				/*
				 * NOTE(review): 'len' is not updated here, so the
				 * returned length reflects the last vn_getpath attempt,
				 * not the mount-point string — confirm callers tolerate
				 * this before relying on the return value in this path.
				 */
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5522 
/*
 * safe_getpath: firmlink-resolving variant of safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int resolve_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5528 
/*
 * safe_getpath_no_firmlink: variant of safe_getpath_new() that does not
 * resolve firmlinks.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int resolve_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5534 
5535 /*
5536  * Make a hard file link.
5537  *
5538  * Returns:	0			Success
5539  *		EPERM
5540  *		EEXIST
5541  *		EXDEV
5542  *	namei:???
5543  *	vnode_authorize:???
5544  *	VNOP_LINK:???
5545  */
5546 /* ARGSUSED */
/*
 * linkat_internal: common implementation for link()/linkat().
 *
 * Looks up the existing object at 'path' (relative to fd1; symlinks are
 * followed only when AT_SYMLINK_FOLLOW is set in 'flag'), then creates a
 * new directory entry 'link' (relative to fd2) referring to it.
 * Directory hardlinks are refused unless the filesystem advertises
 * MNTK_DIR_HARDLINKS, and then only for the superuser or the directory's
 * owner.  On success, audit, kauth-listener and fsevent notifications
 * are emitted as configured.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node; the nameidata struct is reused for this */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry modifies the directory; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Path strings are only built if someone will consume them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			/* drop the iocount taken above; pvp == dvp took none */
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5760 
5761 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5762 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5763 {
5764 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5765 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5766 }
5767 
5768 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5769 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5770 {
5771 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5772 		return EINVAL;
5773 	}
5774 
5775 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5776 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5777 }
5778 
5779 /*
5780  * Make a symbolic link.
5781  *
5782  * We could add support for ACLs here too...
5783  */
5784 /* ARGSUSED */
/*
 * symlinkat_internal: common implementation for symlink()/symlinkat().
 *
 * Creates a symbolic link at 'link' (resolved relative to fd) whose
 * contents are the string at 'path_data'.  'segflg' says whether the
 * addresses are user- or kernel-space; a user-space link string is
 * copied into a ZV_NAMEI buffer first.  The link's mode is ACCESSPERMS
 * filtered through the process umask.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry modifies the directory; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* VNOP_SYMLINK did not return the vnode; look it up. */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copied-in link string (user-space case only). */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5948 
5949 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5950 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5951 {
5952 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5953 	           uap->link, UIO_USERSPACE);
5954 }
5955 
5956 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5957 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5958     __unused int32_t *retval)
5959 {
5960 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5961 	           uap->path2, UIO_USERSPACE);
5962 }
5963 
5964 /*
5965  * Delete a whiteout from the filesystem.
5966  * No longer supported.
5967  */
5968 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5969 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5970 {
5971 	return ENOTSUP;
5972 }
5973 
5974 /*
5975  * Delete a name from the filesystem.
5976  */
5977 /* ARGSUSED */
/*
 * unlinkat_internal: common implementation for unlink()/unlinkat()/
 * delete() and the kernel-internal unlink1().
 *
 * Removes the object at 'path_arg' (resolved relative to 'fd', or to
 * 'start_dvp' when one is supplied — a starting dvp trumps the fd).
 * 'unlink_flags' is a mask of VNODE_REMOVE_* modifiers (Carbon
 * busy-file semantics, namespace-event suppression, nofollow-any, audit
 * suppression).  Supports filesystems with compound-remove VNOPs, in
 * which case the lookup may be redriven (EKEEPLOOKING) or retried on
 * racing ENOENT from the authorization callback.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch: nameidata (and fsevent state) are large. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state is reset here; a retry starts the lookup over. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/*
				 * ENOENT here can be a racing lookup hitting the
				 * name cache; retry a bounded number of times.
				 */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp: only possible when the FS does compound remove. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the directory; break any dir lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6264 
6265 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6266 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6267     enum uio_seg segflg, int unlink_flags)
6268 {
6269 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6270 	           unlink_flags);
6271 }
6272 
6273 /*
6274  * Delete a name from the filesystem using Carbon semantics.
6275  */
6276 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6277 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6278 {
6279 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6280 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6281 }
6282 
6283 /*
6284  * Delete a name from the filesystem using POSIX semantics.
6285  */
6286 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6287 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6288 {
6289 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6290 	           uap->path, UIO_USERSPACE, 0);
6291 }
6292 
6293 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6294 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6295 {
6296 	int unlink_flags = 0;
6297 
6298 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6299 		return EINVAL;
6300 	}
6301 
6302 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6303 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6304 	}
6305 
6306 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6307 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6308 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6309 		}
6310 		return rmdirat_internal(vfs_context_current(), uap->fd,
6311 		           uap->path, UIO_USERSPACE, unlink_flags);
6312 	} else {
6313 		return unlinkat_internal(vfs_context_current(), uap->fd,
6314 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6315 	}
6316 }
6317 
6318 /*
6319  * Reposition read/write file offset.
6320  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/*
	 * Resolve the fd to a fileproc/vnode pair.  fp_getfvp() fails with
	 * ENOTSUP for non-vnode file types (e.g. sockets), which POSIX
	 * requires us to report as ESPIPE for lseek.
	 */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	/* FIFOs are vnodes but still not seekable. */
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) only reads the current offset, so it gets
	 * the weaker "get offset" MAC check; any other combination may move
	 * the offset and gets the "change offset" check.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	/* Take an iocount on the vnode for the duration of the operation. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the tentative absolute offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		/* SEEK_CUR: relative to the current file offset */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* SEEK_END: relative to the current file size */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		/* SEEK_SET: 'offset' is already absolute */
		break;
	case SEEK_HOLE:
		/* Delegate to the filesystem: find next hole at/after offset. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		/* Delegate to the filesystem: find next data at/after offset. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6412 
6413 
6414 /*
6415  * Check access permissions.
6416  *
6417  * Returns:	0			Success
6418  *		vnode_authorize:???
6419  */
6420 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6421 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6422 {
6423 	kauth_action_t action;
6424 	int error;
6425 
6426 	/*
6427 	 * If just the regular access bits, convert them to something
6428 	 * that vnode_authorize will understand.
6429 	 */
6430 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6431 		action = 0;
6432 		if (uflags & R_OK) {
6433 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6434 		}
6435 		if (uflags & W_OK) {
6436 			if (vnode_isdir(vp)) {
6437 				action |= KAUTH_VNODE_ADD_FILE |
6438 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6439 				/* might want delete rights here too */
6440 			} else {
6441 				action |= KAUTH_VNODE_WRITE_DATA;
6442 			}
6443 		}
6444 		if (uflags & X_OK) {
6445 			if (vnode_isdir(vp)) {
6446 				action |= KAUTH_VNODE_SEARCH;
6447 			} else {
6448 				action |= KAUTH_VNODE_EXECUTE;
6449 			}
6450 		}
6451 	} else {
6452 		/* take advantage of definition of uflags */
6453 		action = uflags >> 8;
6454 	}
6455 
6456 #if CONFIG_MACF
6457 	error = mac_vnode_check_access(ctx, vp, uflags);
6458 	if (error) {
6459 		return error;
6460 	}
6461 #endif /* MAC */
6462 
6463 	/* action == 0 means only check for existence */
6464 	if (action != 0) {
6465 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6466 	} else {
6467 		error = 0;
6468 	}
6469 
6470 	return error;
6471 }
6472 
6473 
6474 
6475 /*
6476  * access_extended: Check access permissions in bulk.
6477  *
6478  * Description:	uap->entries		Pointer to an array of accessx
6479  *                                      descriptor structs, plus one or
6480  *                                      more NULL terminated strings (see
6481  *                                      "Notes" section below).
6482  *		uap->size		Size of the area pointed to by
6483  *					uap->entries.
6484  *		uap->results		Pointer to the results array.
6485  *
6486  * Returns:	0			Success
6487  *		ENOMEM			Insufficient memory
6488  *		EINVAL			Invalid arguments
6489  *		namei:EFAULT		Bad address
6490  *		namei:ENAMETOOLONG	Filename too long
6491  *		namei:ENOENT		No such file or directory
6492  *		namei:ELOOP		Too many levels of symbolic links
6493  *		namei:EBADF		Bad file descriptor
6494  *		namei:ENOTDIR		Not a directory
6495  *		namei:???
6496  *		access1:
6497  *
6498  * Implicit returns:
6499  *		uap->results		Array contents modified
6500  *
6501  * Notes:	The uap->entries are structured as an arbitrary length array
6502  *		of accessx descriptors, followed by one or more NULL terminated
6503  *		strings
6504  *
6505  *			struct accessx_descriptor[0]
6506  *			...
6507  *			struct accessx_descriptor[n]
6508  *			char name_data[0];
6509  *
6510  *		We determine the entry count by walking the buffer containing
6511  *		the uap->entries argument descriptor.  For each descriptor we
6512  *		see, the valid values for the offset ad_name_offset will be
6513  *		in the byte range:
6514  *
6515  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6516  *						to
6517  *				[ uap->entries + uap->size - 2 ]
6518  *
6519  *		since we must have at least one string, and the string must
6520  *		be at least one character plus the NULL terminator in length.
6521  *
6522  * XXX:		Need to support the check-as uid argument
6523  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL until the real-identity credential is taken; checked at 'out'. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests are served from the stack buffer; larger ones heap. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  "Soft" per-file failures are
		 * recorded in the result slot; anything else aborts the
		 * whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6765 
6766 
6767 /*
6768  * Returns:	0			Success
6769  *		namei:EFAULT		Bad address
6770  *		namei:ENAMETOOLONG	Filename too long
6771  *		namei:ENOENT		No such file or directory
6772  *		namei:ELOOP		Too many levels of symbolic links
6773  *		namei:EBADF		Bad file descriptor
6774  *		namei:ENOTDIR		Not a directory
6775  *		namei:???
6776  *		access1:
6777  */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a credential reference; released at 'out' below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* Either "no follow" flag suppresses following the final symlink. */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks in ANY path component */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* Perform the actual permission check on the resolved vnode. */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* Drop the lookup's iocounts (parent only if WANTPARENT was set). */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6859 
6860 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6861 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6862 {
6863 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6864 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6865 }
6866 
6867 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6868 faccessat(__unused proc_t p, struct faccessat_args *uap,
6869     __unused int32_t *retval)
6870 {
6871 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6872 		return EINVAL;
6873 	}
6874 
6875 	return faccessat_internal(vfs_context_current(), uap->fd,
6876 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6877 }
6878 
6879 /*
6880  * Returns:	0			Success
6881  *		EFAULT
6882  *	copyout:EFAULT
6883  *	namei:???
6884  *	vn_stat:???
6885  */
6886 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6887 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6888     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6889     enum uio_seg segflg, int fd, int flag)
6890 {
6891 	struct nameidata *ndp = NULL;
6892 	int follow;
6893 	union {
6894 		struct stat sb;
6895 		struct stat64 sb64;
6896 	} source = {};
6897 	union {
6898 		struct user64_stat user64_sb;
6899 		struct user32_stat user32_sb;
6900 		struct user64_stat64 user64_sb64;
6901 		struct user32_stat64 user32_sb64;
6902 	} dest = {};
6903 	caddr_t sbp;
6904 	int error, my_size;
6905 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6906 	size_t xsecurity_bufsize;
6907 	void * statptr;
6908 	struct fileproc *fp = NULL;
6909 	int needsrealdev = 0;
6910 
6911 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6912 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
6913 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6914 	    segflg, path, ctx);
6915 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6916 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6917 	}
6918 
6919 #if NAMEDRSRCFORK
6920 	int is_namedstream = 0;
6921 	/* stat calls are allowed for resource forks. */
6922 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6923 #endif
6924 
6925 	if (flag & AT_FDONLY) {
6926 		vnode_t fvp;
6927 
6928 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6929 		if (error) {
6930 			goto out;
6931 		}
6932 		if ((error = vnode_getwithref(fvp))) {
6933 			file_drop(fd);
6934 			goto out;
6935 		}
6936 		ndp->ni_vp = fvp;
6937 	} else {
6938 		error = nameiat(ndp, fd);
6939 		if (error) {
6940 			goto out;
6941 		}
6942 	}
6943 
6944 	statptr = (void *)&source;
6945 
6946 #if NAMEDRSRCFORK
6947 	/* Grab reference on the shadow stream file vnode to
6948 	 * force an inactive on release which will mark it
6949 	 * for recycle.
6950 	 */
6951 	if (vnode_isnamedstream(ndp->ni_vp) &&
6952 	    (ndp->ni_vp->v_parent != NULLVP) &&
6953 	    vnode_isshadow(ndp->ni_vp)) {
6954 		is_namedstream = 1;
6955 		vnode_ref(ndp->ni_vp);
6956 	}
6957 #endif
6958 
6959 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6960 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6961 		/*
6962 		 * If the caller has the file open, and is not
6963 		 * requesting extended security information, we are
6964 		 * going to let them get the basic stat information.
6965 		 */
6966 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6967 		    fp->fp_glob->fg_cred);
6968 	} else {
6969 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6970 		    isstat64, needsrealdev, ctx);
6971 	}
6972 
6973 #if NAMEDRSRCFORK
6974 	if (is_namedstream) {
6975 		vnode_rele(ndp->ni_vp);
6976 	}
6977 #endif
6978 	vnode_put(ndp->ni_vp);
6979 	nameidone(ndp);
6980 
6981 	if (fp) {
6982 		file_drop(fd);
6983 		fp = NULL;
6984 	}
6985 
6986 	if (error) {
6987 		goto out;
6988 	}
6989 	/* Zap spare fields */
6990 	if (isstat64 != 0) {
6991 		source.sb64.st_lspare = 0;
6992 		source.sb64.st_qspare[0] = 0LL;
6993 		source.sb64.st_qspare[1] = 0LL;
6994 		if (vfs_context_is64bit(ctx)) {
6995 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6996 			my_size = sizeof(dest.user64_sb64);
6997 			sbp = (caddr_t)&dest.user64_sb64;
6998 		} else {
6999 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7000 			my_size = sizeof(dest.user32_sb64);
7001 			sbp = (caddr_t)&dest.user32_sb64;
7002 		}
7003 		/*
7004 		 * Check if we raced (post lookup) against the last unlink of a file.
7005 		 */
7006 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7007 			source.sb64.st_nlink = 1;
7008 		}
7009 	} else {
7010 		source.sb.st_lspare = 0;
7011 		source.sb.st_qspare[0] = 0LL;
7012 		source.sb.st_qspare[1] = 0LL;
7013 		if (vfs_context_is64bit(ctx)) {
7014 			munge_user64_stat(&source.sb, &dest.user64_sb);
7015 			my_size = sizeof(dest.user64_sb);
7016 			sbp = (caddr_t)&dest.user64_sb;
7017 		} else {
7018 			munge_user32_stat(&source.sb, &dest.user32_sb);
7019 			my_size = sizeof(dest.user32_sb);
7020 			sbp = (caddr_t)&dest.user32_sb;
7021 		}
7022 
7023 		/*
7024 		 * Check if we raced (post lookup) against the last unlink of a file.
7025 		 */
7026 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7027 			source.sb.st_nlink = 1;
7028 		}
7029 	}
7030 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7031 		goto out;
7032 	}
7033 
7034 	/* caller wants extended security information? */
7035 	if (xsecurity != USER_ADDR_NULL) {
7036 		/* did we get any? */
7037 		if (fsec == KAUTH_FILESEC_NONE) {
7038 			if (susize(xsecurity_size, 0) != 0) {
7039 				error = EFAULT;
7040 				goto out;
7041 			}
7042 		} else {
7043 			/* find the user buffer size */
7044 			xsecurity_bufsize = fusize(xsecurity_size);
7045 
7046 			/* copy out the actual data size */
7047 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7048 				error = EFAULT;
7049 				goto out;
7050 			}
7051 
7052 			/* if the caller supplied enough room, copy out to it */
7053 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7054 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7055 			}
7056 		}
7057 	}
7058 out:
7059 	if (ndp) {
7060 		kfree_type(struct nameidata, ndp);
7061 	}
7062 	if (fsec != KAUTH_FILESEC_NONE) {
7063 		kauth_filesec_free(fsec);
7064 	}
7065 	return error;
7066 }
7067 
7068 /*
7069  * stat_extended: Get file status; with extended security (ACL).
7070  *
7071  * Parameters:    p                       (ignored)
7072  *                uap                     User argument descriptor (see below)
7073  *                retval                  (ignored)
7074  *
7075  * Indirect:      uap->path               Path of file to get status from
7076  *                uap->ub                 User buffer (holds file status info)
7077  *                uap->xsecurity          ACL to get (extended security)
7078  *                uap->xsecurity_size     Size of ACL
7079  *
7080  * Returns:        0                      Success
7081  *                !0                      errno value
7082  *
7083  */
7084 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7085 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7086     __unused int32_t *retval)
7087 {
7088 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7089 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7090 	           0);
7091 }
7092 
7093 /*
7094  * Returns:	0			Success
7095  *	fstatat_internal:???		[see fstatat_internal() in this file]
7096  */
7097 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7098 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7099 {
7100 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7101 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7102 }
7103 
7104 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7105 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7106 {
7107 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7108 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7109 }
7110 
7111 /*
7112  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7113  *
7114  * Parameters:    p                       (ignored)
7115  *                uap                     User argument descriptor (see below)
7116  *                retval                  (ignored)
7117  *
7118  * Indirect:      uap->path               Path of file to get status from
7119  *                uap->ub                 User buffer (holds file status info)
7120  *                uap->xsecurity          ACL to get (extended security)
7121  *                uap->xsecurity_size     Size of ACL
7122  *
7123  * Returns:        0                      Success
7124  *                !0                      errno value
7125  *
7126  */
7127 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7128 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7129 {
7130 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7131 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7132 	           0);
7133 }
7134 
7135 /*
7136  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7137  *
7138  * Parameters:    p                       (ignored)
7139  *                uap                     User argument descriptor (see below)
7140  *                retval                  (ignored)
7141  *
7142  * Indirect:      uap->path               Path of file to get status from
7143  *                uap->ub                 User buffer (holds file status info)
7144  *                uap->xsecurity          ACL to get (extended security)
7145  *                uap->xsecurity_size     Size of ACL
7146  *
7147  * Returns:        0                      Success
7148  *                !0                      errno value
7149  *
7150  */
7151 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7152 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7153 {
7154 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7155 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7156 	           AT_SYMLINK_NOFOLLOW);
7157 }
7158 
7159 /*
7160  * Get file status; this version does not follow links.
7161  */
7162 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7163 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7164 {
7165 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7166 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7167 }
7168 
7169 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7170 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7171 {
7172 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7173 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7174 }
7175 
7176 /*
7177  * lstat64_extended: Get file status; can handle large inode numbers; does not
7178  * follow links; with extended security (ACL).
7179  *
7180  * Parameters:    p                       (ignored)
7181  *                uap                     User argument descriptor (see below)
7182  *                retval                  (ignored)
7183  *
7184  * Indirect:      uap->path               Path of file to get status from
7185  *                uap->ub                 User buffer (holds file status info)
7186  *                uap->xsecurity          ACL to get (extended security)
7187  *                uap->xsecurity_size     Size of ACL
7188  *
7189  * Returns:        0                      Success
7190  *                !0                      errno value
7191  *
7192  */
7193 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7194 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7195 {
7196 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7197 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7198 	           AT_SYMLINK_NOFOLLOW);
7199 }
7200 
7201 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7202 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7203 {
7204 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7205 		return EINVAL;
7206 	}
7207 
7208 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7209 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7210 }
7211 
7212 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7213 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7214     __unused int32_t *retval)
7215 {
7216 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7217 		return EINVAL;
7218 	}
7219 
7220 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7221 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7222 }
7223 
7224 /*
7225  * Get configurable pathname variables.
7226  *
7227  * Returns:	0			Success
7228  *	namei:???
7229  *	vn_pathconf:???
7230  *
 * Notes:	Global implementation constants are intended to be
7232  *		implemented in this function directly; all other constants
7233  *		are per-FS implementation, and therefore must be handled in
7234  *		each respective FS, instead.
7235  *
7236  * XXX We implement some things globally right now that should actually be
7237  * XXX per-FS; we will need to deal with this at some point.
7238  */
7239 /* ARGSUSED */
int
pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
{
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Resolve the path, following symlinks; audit the looked-up vnode. */
	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/* Both global and per-FS pathconf names are dispatched here. */
	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);

	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7260 
7261 /*
7262  * Return target name of a symbolic link.
7263  */
7264 /* ARGSUSED */
7265 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7266 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7267     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7268     int *retval)
7269 {
7270 	vnode_t vp;
7271 	uio_t auio;
7272 	int error;
7273 	struct nameidata nd;
7274 	UIO_STACKBUF(uio_buf, 1);
7275 	bool put_vnode;
7276 
7277 	if (bufsize > INT32_MAX) {
7278 		return EINVAL;
7279 	}
7280 
7281 	if (lnk_vp) {
7282 		vp = lnk_vp;
7283 		put_vnode = false;
7284 	} else {
7285 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7286 		    seg, path, ctx);
7287 
7288 		error = nameiat(&nd, fd);
7289 		if (error) {
7290 			return error;
7291 		}
7292 		vp = nd.ni_vp;
7293 		put_vnode = true;
7294 		nameidone(&nd);
7295 	}
7296 
7297 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7298 	    &uio_buf[0], sizeof(uio_buf));
7299 	uio_addiov(auio, buf, bufsize);
7300 	if (vp->v_type != VLNK) {
7301 		error = EINVAL;
7302 	} else {
7303 #if CONFIG_MACF
7304 		error = mac_vnode_check_readlink(ctx, vp);
7305 #endif
7306 		if (error == 0) {
7307 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7308 			    ctx);
7309 		}
7310 		if (error == 0) {
7311 			error = VNOP_READLINK(vp, auio, ctx);
7312 		}
7313 	}
7314 
7315 	if (put_vnode) {
7316 		vnode_put(vp);
7317 	}
7318 
7319 	*retval = (int)(bufsize - uio_resid(auio));
7320 	return error;
7321 }
7322 
/*
 * freadlink: Read the target of a symbolic link that is already open as a
 * file descriptor.
 */
int
freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
{
	enum uio_seg procseg;
	vnode_t vp;
	int error;

	/* Pick the user address-space layout matching the calling process. */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* fd (-1) and path (0) are unused when an explicit vnode is passed. */
	error = readlinkat_internal(vfs_context_current(), -1,
	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
	    uap->bufsize, procseg, retval);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7350 
/*
 * readlink: Read the target of the symbolic link named by uap->path,
 * relative to the current working directory.
 */
int
readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
{
	enum uio_seg procseg;

	/* Pick the user address-space layout matching the calling process. */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
	           uap->count, procseg, retval);
}
7361 
/*
 * readlinkat: Read the target of the symbolic link named by uap->path,
 * resolved relative to the directory file descriptor uap->fd.
 */
int
readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
{
	enum uio_seg procseg;

	/* Pick the user address-space layout matching the calling process. */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
	           retval);
}
7372 
7373 /*
7374  * Change file flags, the deep inner layer.
7375  */
/*
 * chflags0: Change file flags, the deep inner layer.
 *
 * Runs the MAC check, authorizes the attribute change while disregarding
 * immutability (so immutable flags can be cleared), then invokes the
 * caller-supplied `setattr' callback with `arg'.
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only if the setattr callback succeeded. */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7414 
7415 /*
7416  * Change file flags.
7417  *
7418  * NOTE: this will vnode_put() `vp'
7419  */
/*
 * chflags1: Change file flags given a vnode.
 *
 * NOTE: this will vnode_put() `vp' in all cases.
 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* vnode_setattr is used directly as chflags0's setattr callback. */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	vnode_put(vp);

	/* The filesystem must actually support va_flags for this to count. */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7438 
7439 /*
7440  * Change flags of a file given a path name.
7441  */
7442 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent directory so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory, then release it. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7477 
7478 /*
7479  * Change flags of a file given a file descriptor.
7480  */
7481 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* `true' asks vnode_breakdirlease to locate the parent itself. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7511 
7512 /*
7513  * Change security information on a filesystem object.
7514  *
7515  * Returns:	0			Success
7516  *		EPERM			Operation not permitted
7517  *		vnode_authattr:???	[anything vnode_authattr can return]
7518  *		vnode_authorize:???	[anything vnode_authorize can return]
7519  *		vnode_setattr:???	[anything vnode_setattr can return]
7520  *
7521  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7522  *		translated to EPERM before being returned.
7523  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks: mode, ownership, and ACL changes each get a hook. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* EACCES from the auth layer is reported as EPERM here. */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Post-change notifications mirror the pre-checks above. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7591 
7592 
7593 /*
7594  * Change mode of a file given a path name.
7595  *
7596  * Returns:	0			Success
7597  *		namei:???		[anything namei can return]
7598  *		chmod_vnode:???		[anything chmod_vnode can return]
7599  */
/*
 * chmodat: Apply the security attributes in `vap' to the object named by
 * `path' (segment `segflg'), resolved relative to `fd'.
 *
 * `flag' may contain AT_SYMLINK_NOFOLLOW / AT_SYMLINK_NOFOLLOW_ANY to
 * control symlink traversal.
 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent directory so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7632 
/*
 * chmod_extended_init: Populate `pva' (and possibly `*pxsecdst') from the
 * chmod_extended/fchmod_extended arguments.
 *
 * mode == -1 leaves va_mode inactive; KAUTH_UID_NONE / KAUTH_GID_NONE skip
 * the respective ownership change.  `xsecurity' may be NULL (no ACL change),
 * the _FILESEC_REMOVE_ACL sentinel value 1 (delete the ACL), or a user
 * pointer to a filesec structure to copy in.  When a filesec is copied in,
 * the caller must free *pxsecdst with kauth_filesec_free() after use.
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7677 
7678 /*
7679  * chmod_extended: Change the mode of a file given a path name; with extended
7680  * argument list (including extended security (ACL)).
7681  *
7682  * Parameters:	p			Process requesting the open
7683  *		uap			User argument descriptor (see below)
7684  *		retval			(ignored)
7685  *
7686  * Indirect:	uap->path		Path to object (same as 'chmod')
7687  *		uap->uid		UID to set
7688  *		uap->gid		GID to set
7689  *		uap->mode		File mode to set (same as 'chmod')
7690  *		uap->xsecurity		ACL to set (or delete)
7691  *
7692  * Returns:	0			Success
7693  *		!0			errno value
7694  *
7695  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7696  *
 * XXX:		We should enumerate the possible errno values here, and where
7698  *		in the code they originated.
7699  */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Translate the extended argument list into a vnode_attr (+ filesec). */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* Free any filesec kauth_copyinfilesec allocated in the init step. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7724 
7725 /*
7726  * Returns:	0			Success
7727  *		chmodat:???		[anything chmodat can return]
7728  */
7729 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7730 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7731     int flag, enum uio_seg segflg)
7732 {
7733 	struct vnode_attr va;
7734 
7735 	VATTR_INIT(&va);
7736 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7737 
7738 	return chmodat(ctx, path, &va, fd, flag, segflg);
7739 }
7740 
/*
 * chmod: Change mode of a file given a path name, relative to the current
 * working directory; follows symlinks.
 */
int
chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
{
	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
	           AT_FDCWD, 0, UIO_USERSPACE);
}
7747 
7748 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7749 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7750 {
7751 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7752 		return EINVAL;
7753 	}
7754 
7755 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7756 	           uap->fd, uap->flag, UIO_USERSPACE);
7757 }
7758 
7759 /*
7760  * Change mode of a file given a file descriptor.
7761  */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* `true' asks vnode_breakdirlease to locate the parent itself. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7789 
7790 /*
7791  * fchmod_extended: Change mode of a file given a file descriptor; with
7792  * extended argument list (including extended security (ACL)).
7793  *
7794  * Parameters:    p                       Process requesting to change file mode
7795  *                uap                     User argument descriptor (see below)
7796  *                retval                  (ignored)
7797  *
7798  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7799  *                uap->uid                UID to set
7800  *                uap->gid                GID to set
7801  *                uap->xsecurity          ACL to set (or delete)
7802  *                uap->fd                 File descriptor of file to change mode
7803  *
7804  * Returns:        0                      Success
7805  *                !0                      errno value
7806  *
7807  */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Translate the extended argument list into a vnode_attr (+ filesec). */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = fchmod1(p, uap->fd, &va);

	/* Free any filesec kauth_copyinfilesec allocated in the init step. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7831 
/*
 * fchmod: Change mode of a file given a file descriptor.
 */
int
fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;

	VATTR_INIT(&va);
	/* Only the permission bits (ALLPERMS) are honored. */
	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);

	return fchmod1(p, uap->fd, &va);
}
7842 
/*
 * vn_chown_internal: Change ownership of a vnode.
 *
 * A uid or gid of VNOVAL leaves that field unchanged.  EACCES from the
 * authorization step is translated to EPERM (EACCES is reserved for
 * namei()-level failures).
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* `true' asks vnode_breakdirlease to locate the parent itself. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after ownership actually changed. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7904 
7905 /*
7906  * Set ownership given a path name.
7907  */
7908 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either NOFOLLOW flag means: do not follow a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7938 
/*
 * chown: Set ownership given a path name, relative to the current working
 * directory; follows symlinks.
 */
int
chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
{
	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	           uap->uid, uap->gid, 0, UIO_USERSPACE);
}
7945 
/*
 * lchown: Set ownership given a path name; operates on a trailing symlink
 * itself (AT_SYMLINK_NOFOLLOW) rather than its target.
 */
int
lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
{
	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
}
7952 
/*
 * fchownat: Set ownership given a path name resolved relative to the
 * directory file descriptor uap->fd.
 *
 * Note: unlike fchmodat above, only AT_SYMLINK_NOFOLLOW is accepted here;
 * AT_SYMLINK_NOFOLLOW_ANY is rejected with EINVAL.
 */
int
fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
{
	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
		return EINVAL;
	}

	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
}
7963 
7964 /*
7965  * Set ownership given a file descriptor.
7966  */
7967 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7994 
/*
 * getutimes: Convert a user-supplied pair of timevals at `usrtvp' into two
 * timespecs in tsp[] (consumed by setutimes as access and modification
 * times).  A NULL user pointer yields the current time in both entries.
 *
 * Returns:	0	Success
 *		copyin:???
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/* Copy in the layout matching the calling process's ABI. */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8027 
/*
 * setutimes: Set the access (ts[0]) and modification (ts[1]) times of `vp'.
 *
 * `nullflag' is set when the caller passed a NULL timeval pointer (i.e.
 * "use the current time"); it relaxes authorization via VA_UTIMES_NULL and
 * suppresses the EACCES->EPERM translation below.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Explicit-time failures report EPERM rather than EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after the times actually changed. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8084 
8085 /*
8086  * Set the access and modification times of a file.
8087  */
8088 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent directory so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* The parent was only held (WANTPARENT) when leases are configured. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8137 
8138 /*
8139  * Set the access and modification times of a file.
8140  */
8141 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Fetch the times (or the current time when tptr is NULL) first. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* `true' asks vnode_breakdirlease to locate the parent itself. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8173 
8174 static int
truncate_validate_common(proc_t p,off_t length)8175 truncate_validate_common(proc_t p, off_t length)
8176 {
8177 	rlim_t fsize_limit;
8178 
8179 	if (length < 0) {
8180 		return EINVAL;
8181 	}
8182 
8183 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8184 	if ((rlim_t)length > fsize_limit) {
8185 		psignal(p, SIGXFSZ);
8186 		return EFBIG;
8187 	}
8188 
8189 	return 0;
8190 }
8191 
/*
 * truncate_internal: Set the data size of `vp' to `length'.
 *
 * `need_auth' is false when the caller (ftruncate) already established
 * write authority at open time; in that case the vnode_authattr /
 * vnode_authorize step is skipped.  `cred' is consumed only by the MAC
 * truncate hooks.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after the size actually changed. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8242 
8243 /*
8244  * Truncate a file given its path name.
8245  */
8246 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	struct nameidata nd;

	/* Validate length and enforce RLIMIT_FSIZE before the lookup. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	if ((error = namei(&nd))) {
		return error;
	}

	vp = nd.ni_vp;
	nameidone(&nd);

	/* need_auth == true: path-based truncate must authorize the write. */
	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
	vnode_put(vp);

	return error;
}
8274 
8275 /*
8276  * Truncate a file given a file descriptor.
8277  */
8278 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length and enforce RLIMIT_FSIZE before touching the fd. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared memory objects are truncated through their own path. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write permission was established at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8329 
8330 
/*
 * Sync an open file with synchronized I/O _file_ integrity completion
 */
/* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync(2) is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_WAIT requests full file (data + metadata) integrity. */
	return fsync_common(p, uap, MNT_WAIT);
}
8341 
8342 
/*
 * Sync an open file with synchronized I/O _file_ integrity completion
 *
 * Notes:	This is a legacy support function that does not test for
 *		thread cancellation points.
 */
/* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Arg structs are layout-identical, so the cast is safe. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8355 
8356 
/*
 * Sync an open file with synchronized I/O _data_ integrity completion
 */
/* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync(2) is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_DWAIT requests data-only integrity (metadata may be deferred). */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8367 
8368 
8369 /*
8370  * fsync_common
8371  *
8372  * Common fsync code to support both synchronized I/O file integrity completion
8373  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8374  *
8375  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8376  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8378  * includes additional metadata unnecessary for retrieving the file data
8379  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8380  * storage.
8381  *
8382  * Parameters:	p				The process
8383  *		uap->fd				The descriptor to synchronize
8384  *		flags				The data integrity flags
8385  *
8386  * Returns:	int				Success
8387  *	fp_getfvp:EBADF				Bad file descriptor
8388  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8389  *	VNOP_FSYNC:???				unspecified
8390  *
8391  * Notes:	We use struct fsync_args because it is a short name, and all
8392  *		caller argument structures are otherwise identical.
8393  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve fd to fileproc + vnode; takes a file reference. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; drop the file ref on failure. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* Hand the sync request (MNT_WAIT or MNT_DWAIT) to the filesystem. */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8431 
/*
 * Duplicate files.  Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 *     perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; holds an iocount on fvp until 'out1'. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target for creation.  SAVESTART keeps the start
	 * directory (ni_startdir) referenced so it can be released at 'out'.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories cannot be copied through this interface. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are unsupported, except fdesc-backed vnodes. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Must be able to read the source... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ...delete an existing target... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ...and add a file in the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* The source must not be the target's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 *
	 * error = -1 is an internal "success, nothing to do" marker,
	 * translated to 0 at the bottom of this function.
	 */
	if (fvp == tvp) {
		error = -1;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the target directory before modifying it. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Translate the internal "nothing to do" marker into success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8546 
8547 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8548 
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;	/* attributes read from the source */
	struct vnode_attr nva;	/* attributes to apply to the clone */
	uint32_t vnop_flags;

	/* Only regular files, symlinks, and non-root directories can be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points may not be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; WANTPARENT also grants an iocount on tdvp. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross mount boundaries. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* May we add a file/subdirectory to the destination directory? */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * May we read the source?  If the caller already authorized data
	 * reads (e.g. fclonefileat on an FREAD descriptor), skip that bit.
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;	/* kauth_acl_free() needed at 'out' */
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;	/* vn_attribute_cleanup() needed at 'out' */
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		/* DATAVAULT/RESTRICTED bits come from the target dir, not the source. */
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the destination directory before creating. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			/* Unexpected type from the VNOP: skip events, go clean up. */
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8782 
8783 /*
8784  * clone files or directories, target must not exist.
8785  */
8786 /* ARGSUSED */
8787 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8788 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8789     __unused int32_t *retval)
8790 {
8791 	vnode_t fvp;
8792 	struct nameidata fromnd;
8793 	int follow;
8794 	int error;
8795 	vfs_context_t ctx = vfs_context_current();
8796 
8797 	/* Check that the flags are valid. */
8798 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8799 		return EINVAL;
8800 	}
8801 
8802 	AUDIT_ARG(fd, uap->src_dirfd);
8803 
8804 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8805 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8806 	    UIO_USERSPACE, uap->src, ctx);
8807 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8808 		return error;
8809 	}
8810 
8811 	fvp = fromnd.ni_vp;
8812 	nameidone(&fromnd);
8813 
8814 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8815 	    uap->flags, ctx);
8816 
8817 	vnode_put(fvp);
8818 	return error;
8819 }
8820 
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/*
	 * Resolve the source fd; takes a file reference paired with
	 * file_drop() at 'out'.  NOTE(review): 'p' is marked __unused in
	 * the signature but is used here — the annotation is stale.
	 */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Take an iocount on the source vnode before cloning. */
	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/*
	 * TRUE: the FREAD check above stands in for data-read authorization,
	 * so clonefile_internal() skips re-checking KAUTH_VNODE_READ_DATA.
	 */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8861 
8862 static int
rename_submounts_callback(mount_t mp,void * arg)8863 rename_submounts_callback(mount_t mp, void *arg)
8864 {
8865 	int error = 0;
8866 	mount_t pmp = (mount_t)arg;
8867 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8868 
8869 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8870 		return 0;
8871 	}
8872 
8873 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8874 		return 0;
8875 	}
8876 
8877 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8878 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8879 		return -1;
8880 	}
8881 
8882 	size_t pathlen = MAXPATHLEN;
8883 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8884 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8885 	}
8886 
8887 	vfs_unbusy(mp);
8888 
8889 	return error;
8890 }
8891 
8892 /*
8893  * Rename files.  Source and destination must either both be directories,
8894  * or both not be directories.  If target is a directory, it must be empty.
8895  */
8896 /* ARGSUSED */
8897 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8898 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8899     int tofd, user_addr_t to, int segflg, u_int uflags)
8900 {
8901 	vnode_t tvp, tdvp;
8902 	vnode_t fvp, fdvp;
8903 	vnode_t mnt_fvp;
8904 	struct nameidata *fromnd, *tond;
8905 	int error = 0;
8906 	int do_retry;
8907 	int retry_count;
8908 	int mntrename;
8909 	int need_event;
8910 	int need_kpath2;
8911 	int has_listeners;
8912 	const char *oname = NULL;
8913 	char *from_name = NULL, *to_name = NULL;
8914 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8915 	int from_len = 0, to_len = 0;
8916 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8917 	int holding_mntlock;
8918 	int vn_authorize_skipped;
8919 	mount_t locked_mp = NULL;
8920 	vnode_t oparent = NULLVP;
8921 #if CONFIG_FSE
8922 	fse_info from_finfo = {}, to_finfo;
8923 #endif
8924 	int from_truncated = 0, to_truncated = 0;
8925 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8926 	int batched = 0;
8927 	struct vnode_attr *fvap, *tvap;
8928 	int continuing = 0;
8929 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8930 	int32_t nofollow_any = 0;
8931 	/* carving out a chunk for structs that are too big to be on stack. */
8932 	struct {
8933 		struct nameidata from_node, to_node;
8934 		struct vnode_attr fv_attr, tv_attr;
8935 	} * __rename_data;
8936 
8937 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8938 	fromnd = &__rename_data->from_node;
8939 	tond = &__rename_data->to_node;
8940 
8941 	holding_mntlock = 0;
8942 	do_retry = 0;
8943 	retry_count = 0;
8944 retry:
8945 	fvp = tvp = NULL;
8946 	fdvp = tdvp = NULL;
8947 	fvap = tvap = NULL;
8948 	mnt_fvp = NULLVP;
8949 	mntrename = FALSE;
8950 	vn_authorize_skipped = FALSE;
8951 
8952 	if (uflags & RENAME_NOFOLLOW_ANY) {
8953 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8954 	}
8955 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8956 	    segflg, from, ctx);
8957 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8958 
8959 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8960 	    segflg, to, ctx);
8961 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8962 
8963 continue_lookup:
8964 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8965 		if ((error = nameiat(fromnd, fromfd))) {
8966 			goto out1;
8967 		}
8968 		fdvp = fromnd->ni_dvp;
8969 		fvp  = fromnd->ni_vp;
8970 
8971 		if (fvp && fvp->v_type == VDIR) {
8972 			tond->ni_cnd.cn_flags |= WILLBEDIR;
8973 		}
8974 	}
8975 
8976 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8977 		if ((error = nameiat(tond, tofd))) {
8978 			/*
8979 			 * Translate error code for rename("dir1", "dir2/.").
8980 			 */
8981 			if (error == EISDIR && fvp->v_type == VDIR) {
8982 				error = EINVAL;
8983 			}
8984 			goto out1;
8985 		}
8986 		tdvp = tond->ni_dvp;
8987 		tvp  = tond->ni_vp;
8988 	}
8989 
8990 #if DEVELOPMENT || DEBUG
8991 	/*
8992 	 * XXX VSWAP: Check for entitlements or special flag here
8993 	 * so we can restrict access appropriately.
8994 	 */
8995 #else /* DEVELOPMENT || DEBUG */
8996 
8997 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8998 		error = EPERM;
8999 		goto out1;
9000 	}
9001 
9002 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9003 		error = EPERM;
9004 		goto out1;
9005 	}
9006 #endif /* DEVELOPMENT || DEBUG */
9007 
9008 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9009 		error = ENOENT;
9010 		goto out1;
9011 	}
9012 
9013 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9014 		int32_t pval = 0;
9015 		int err = 0;
9016 
9017 		/*
9018 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9019 		 * has the same name as target iff the following conditions are met:
9020 		 * 1. the target file system is case insensitive
9021 		 * 2. source and target directories are the same
9022 		 * 3. source and target files are the same
9023 		 * 4. name only differs in case (determined by underlying filesystem)
9024 		 */
9025 		if (fvp != tvp || fdvp != tdvp) {
9026 			error = EEXIST;
9027 			goto out1;
9028 		}
9029 
9030 		/*
9031 		 * Assume that the target file system is case sensitive if
9032 		 * _PC_CASE_SENSITIVE selector isn't supported.
9033 		 */
9034 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9035 		if (err != 0 || pval != 0) {
9036 			error = EEXIST;
9037 			goto out1;
9038 		}
9039 	}
9040 
9041 	batched = vnode_compound_rename_available(fdvp);
9042 
9043 #if CONFIG_FSE
9044 	need_event = need_fsevent(FSE_RENAME, fdvp);
9045 	if (need_event) {
9046 		if (fvp) {
9047 			get_fse_info(fvp, &from_finfo, ctx);
9048 		} else {
9049 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9050 			if (error) {
9051 				goto out1;
9052 			}
9053 
9054 			fvap = &__rename_data->fv_attr;
9055 		}
9056 
9057 		if (tvp) {
9058 			get_fse_info(tvp, &to_finfo, ctx);
9059 		} else if (batched) {
9060 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9061 			if (error) {
9062 				goto out1;
9063 			}
9064 
9065 			tvap = &__rename_data->tv_attr;
9066 		}
9067 	}
9068 #else
9069 	need_event = 0;
9070 #endif /* CONFIG_FSE */
9071 
9072 	has_listeners = kauth_authorize_fileop_has_listeners();
9073 
9074 	need_kpath2 = 0;
9075 #if CONFIG_AUDIT
9076 	if (AUDIT_RECORD_EXISTS()) {
9077 		need_kpath2 = 1;
9078 	}
9079 #endif
9080 
9081 	if (need_event || has_listeners) {
9082 		if (from_name == NULL) {
9083 			GET_PATH(from_name);
9084 		}
9085 
9086 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9087 
9088 		if (from_name_no_firmlink == NULL) {
9089 			GET_PATH(from_name_no_firmlink);
9090 		}
9091 
9092 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9093 	}
9094 
9095 	if (need_event || need_kpath2 || has_listeners) {
9096 		if (to_name == NULL) {
9097 			GET_PATH(to_name);
9098 		}
9099 
9100 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9101 
9102 		if (to_name_no_firmlink == NULL) {
9103 			GET_PATH(to_name_no_firmlink);
9104 		}
9105 
9106 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9107 		if (to_name && need_kpath2) {
9108 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9109 		}
9110 	}
9111 	if (!fvp) {
9112 		/*
9113 		 * Claim: this check will never reject a valid rename.
9114 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9115 		 * Suppose fdvp and tdvp are not on the same mount.
9116 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9117 		 *      then you can't move it to within another dir on the same mountpoint.
9118 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9119 		 *
9120 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9121 		 */
9122 		if (fdvp->v_mount != tdvp->v_mount) {
9123 			error = EXDEV;
9124 			goto out1;
9125 		}
9126 		goto skipped_lookup;
9127 	}
9128 
9129 	/*
9130 	 * If the source and destination are the same (i.e. they're
9131 	 * links to the same vnode) and the target file system is
9132 	 * case sensitive, then there is nothing to do.
9133 	 *
9134 	 * XXX Come back to this.
9135 	 */
9136 	if (fvp == tvp) {
9137 		int pathconf_val;
9138 
9139 		/*
9140 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9141 		 * then assume that this file system is case sensitive.
9142 		 */
9143 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9144 		    pathconf_val != 0) {
9145 			vn_authorize_skipped = TRUE;
9146 			goto out1;
9147 		}
9148 	}
9149 
9150 	/*
9151 	 * Allow the renaming of mount points.
9152 	 * - target must not exist
9153 	 * - target must reside in the same directory as source
9154 	 * - union mounts cannot be renamed
9155 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9156 	 *
9157 	 * XXX Handle this in VFS after a continued lookup (if we missed
9158 	 * in the cache to start off)
9159 	 *
9160 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9161 	 * we'll skip past here.  The file system is responsible for
9162 	 * checking that @tvp is not a descendent of @fvp and vice versa
9163 	 * so it should always return EINVAL if either @tvp or @fvp is the
9164 	 * root of a volume.
9165 	 */
9166 	if ((fvp->v_flag & VROOT) &&
9167 	    (fvp->v_type == VDIR) &&
9168 	    (tvp == NULL) &&
9169 	    (fvp->v_mountedhere == NULL) &&
9170 	    (fdvp == tdvp) &&
9171 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9172 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9173 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9174 		vnode_t coveredvp;
9175 
9176 		/* switch fvp to the covered vnode */
9177 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9178 		if ((vnode_getwithref(coveredvp))) {
9179 			error = ENOENT;
9180 			goto out1;
9181 		}
9182 		/*
9183 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9184 		 * later.
9185 		 */
9186 		mnt_fvp = fvp;
9187 
9188 		fvp = coveredvp;
9189 		mntrename = TRUE;
9190 	}
9191 	/*
9192 	 * Check for cross-device rename.
9193 	 */
9194 	if ((fvp->v_mount != tdvp->v_mount) ||
9195 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9196 		error = EXDEV;
9197 		goto out1;
9198 	}
9199 
9200 	/*
9201 	 * If source is the same as the destination (that is the
9202 	 * same inode number) then there is nothing to do...
9203 	 * EXCEPT if the underlying file system supports case
9204 	 * insensitivity and is case preserving.  In this case
9205 	 * the file system needs to handle the special case of
9206 	 * getting the same vnode as target (fvp) and source (tvp).
9207 	 *
9208 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9209 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9210 	 * handle the special case of getting the same vnode as target and
9211 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9212 	 * so not to cause locking problems. There is a single reference on tvp.
9213 	 *
9214 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9215 	 * that correct behaviour then is just to return success without doing
9216 	 * anything.
9217 	 *
9218 	 * XXX filesystem should take care of this itself, perhaps...
9219 	 */
9220 	if (fvp == tvp && fdvp == tdvp) {
9221 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9222 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9223 		    fromnd->ni_cnd.cn_namelen)) {
9224 			vn_authorize_skipped = TRUE;
9225 			goto out1;
9226 		}
9227 	}
9228 
9229 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9230 		/*
9231 		 * we're holding a reference and lock
9232 		 * on locked_mp, but it no longer matches
9233 		 * what we want to do... so drop our hold
9234 		 */
9235 		mount_unlock_renames(locked_mp);
9236 		mount_drop(locked_mp, 0);
9237 		holding_mntlock = 0;
9238 	}
9239 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9240 		/*
9241 		 * serialize renames that re-shape
9242 		 * the tree... if holding_mntlock is
9243 		 * set, then we're ready to go...
9244 		 * otherwise we
9245 		 * first need to drop the iocounts
9246 		 * we picked up, second take the
9247 		 * lock to serialize the access,
9248 		 * then finally start the lookup
9249 		 * process over with the lock held
9250 		 */
9251 		if (!holding_mntlock) {
9252 			/*
9253 			 * need to grab a reference on
9254 			 * the mount point before we
9255 			 * drop all the iocounts... once
9256 			 * the iocounts are gone, the mount
9257 			 * could follow
9258 			 */
9259 			locked_mp = fvp->v_mount;
9260 			mount_ref(locked_mp, 0);
9261 
9262 			/*
9263 			 * nameidone has to happen before we vnode_put(tvp)
9264 			 * since it may need to release the fs_nodelock on the tvp
9265 			 */
9266 			nameidone(tond);
9267 
9268 			if (tvp) {
9269 				vnode_put(tvp);
9270 			}
9271 			vnode_put(tdvp);
9272 
9273 			/*
9274 			 * nameidone has to happen before we vnode_put(fdvp)
9275 			 * since it may need to release the fs_nodelock on the fvp
9276 			 */
9277 			nameidone(fromnd);
9278 
9279 			vnode_put(fvp);
9280 			vnode_put(fdvp);
9281 
9282 			if (mnt_fvp != NULLVP) {
9283 				vnode_put(mnt_fvp);
9284 			}
9285 
9286 			mount_lock_renames(locked_mp);
9287 			holding_mntlock = 1;
9288 
9289 			goto retry;
9290 		}
9291 	} else {
9292 		/*
9293 		 * when we dropped the iocounts to take
9294 		 * the lock, we allowed the identity of
9295 		 * the various vnodes to change... if they did,
9296 		 * we may no longer be dealing with a rename
9297 		 * that reshapes the tree... once we're holding
9298 		 * the iocounts, the vnodes can't change type
9299 		 * so we're free to drop the lock at this point
9300 		 * and continue on
9301 		 */
9302 		if (holding_mntlock) {
9303 			mount_unlock_renames(locked_mp);
9304 			mount_drop(locked_mp, 0);
9305 			holding_mntlock = 0;
9306 		}
9307 	}
9308 
9309 	if (!batched) {
9310 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9311 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9312 		    flags, NULL);
9313 		if (error) {
9314 			if (error == ENOENT) {
9315 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9316 					/*
9317 					 * We encountered a race where after doing the namei,
9318 					 * tvp stops being valid. If so, simply re-drive the rename
9319 					 * call from the top.
9320 					 */
9321 					do_retry = 1;
9322 					retry_count += 1;
9323 				}
9324 			}
9325 			goto out1;
9326 		}
9327 	}
9328 
9329 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9330 	if (mnt_fvp != NULLVP) {
9331 		vnode_put(mnt_fvp);
9332 		mnt_fvp = NULLVP;
9333 	}
9334 
9335 	// save these off so we can later verify that fvp is the same
9336 	oname   = fvp->v_name;
9337 	oparent = fvp->v_parent;
9338 
9339 skipped_lookup:
9340 #if CONFIG_FILE_LEASES
9341 	/* Lease break needed for source's parent dir? */
9342 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9343 
9344 	/* Lease break needed for target's parent dir? */
9345 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9346 #endif
9347 
9348 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9349 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9350 	    flags, ctx);
9351 
9352 	if (holding_mntlock) {
9353 		/*
9354 		 * we can drop our serialization
9355 		 * lock now
9356 		 */
9357 		mount_unlock_renames(locked_mp);
9358 		mount_drop(locked_mp, 0);
9359 		holding_mntlock = 0;
9360 	}
9361 	if (error) {
9362 		if (error == EDATALESS) {
9363 			/*
9364 			 * If we've been here before, something has gone
9365 			 * horribly wrong and we should just get out lest
9366 			 * we spiral around the drain forever.
9367 			 */
9368 			if (flags & VFS_RENAME_DATALESS) {
9369 				error = EIO;
9370 				goto out1;
9371 			}
9372 
9373 			/*
9374 			 * The object we're renaming is dataless (or has a
9375 			 * dataless descendent) and requires materialization
9376 			 * before the rename occurs.  But we're holding the
9377 			 * mount point's rename lock, so it's not safe to
9378 			 * make the upcall.
9379 			 *
9380 			 * In this case, we release the lock (above), perform
9381 			 * the materialization, and start the whole thing over.
9382 			 */
9383 			error = vfs_materialize_reparent(fvp, tdvp);
9384 			if (error == 0) {
9385 				/*
9386 				 * The next time around we need to tell the
9387 				 * file system that the materializtaion has
9388 				 * been performed.
9389 				 */
9390 				flags |= VFS_RENAME_DATALESS;
9391 				do_retry = 1;
9392 			}
9393 			goto out1;
9394 		}
9395 		if (error == EKEEPLOOKING) {
9396 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9397 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9398 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9399 				}
9400 			}
9401 
9402 			fromnd->ni_vp = fvp;
9403 			tond->ni_vp = tvp;
9404 
9405 			goto continue_lookup;
9406 		}
9407 
9408 		/*
9409 		 * We may encounter a race in the VNOP where the destination didn't
9410 		 * exist when we did the namei, but it does by the time we go and
9411 		 * try to create the entry. In this case, we should re-drive this rename
9412 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9413 		 * but other filesystems susceptible to this race could return it, too.
9414 		 */
9415 		if (error == ERECYCLE) {
9416 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9417 				do_retry = 1;
9418 				retry_count += 1;
9419 			} else {
9420 				printf("rename retry limit due to ERECYCLE reached\n");
9421 				error = ENOENT;
9422 			}
9423 		}
9424 
9425 		/*
9426 		 * For compound VNOPs, the authorization callback may return
9427 		 * ENOENT in case of racing hardlink lookups hitting the name
9428 		 * cache, redrive the lookup.
9429 		 */
9430 		if (batched && error == ENOENT) {
9431 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9432 				do_retry = 1;
9433 				retry_count += 1;
9434 			}
9435 		}
9436 
9437 		goto out1;
9438 	}
9439 
9440 	/* call out to allow 3rd party notification of rename.
9441 	 * Ignore result of kauth_authorize_fileop call.
9442 	 */
9443 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9444 	    KAUTH_FILEOP_RENAME,
9445 	    (uintptr_t)from_name, (uintptr_t)to_name);
9446 	if (flags & VFS_RENAME_SWAP) {
9447 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9448 		    KAUTH_FILEOP_RENAME,
9449 		    (uintptr_t)to_name, (uintptr_t)from_name);
9450 	}
9451 
9452 #if CONFIG_FSE
9453 	if (from_name != NULL && to_name != NULL) {
9454 		if (from_truncated || to_truncated) {
9455 			// set it here since only the from_finfo gets reported up to user space
9456 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9457 		}
9458 
9459 		if (tvap && tvp) {
9460 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9461 		}
9462 		if (fvap) {
9463 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9464 		}
9465 
9466 		if (tvp) {
9467 			add_fsevent(FSE_RENAME, ctx,
9468 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9469 			    FSE_ARG_FINFO, &from_finfo,
9470 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9471 			    FSE_ARG_FINFO, &to_finfo,
9472 			    FSE_ARG_DONE);
9473 			if (flags & VFS_RENAME_SWAP) {
9474 				/*
9475 				 * Strictly speaking, swap is the equivalent of
9476 				 * *three* renames.  FSEvents clients should only take
9477 				 * the events as a hint, so we only bother reporting
9478 				 * two.
9479 				 */
9480 				add_fsevent(FSE_RENAME, ctx,
9481 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9482 				    FSE_ARG_FINFO, &to_finfo,
9483 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9484 				    FSE_ARG_FINFO, &from_finfo,
9485 				    FSE_ARG_DONE);
9486 			}
9487 		} else {
9488 			add_fsevent(FSE_RENAME, ctx,
9489 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9490 			    FSE_ARG_FINFO, &from_finfo,
9491 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9492 			    FSE_ARG_DONE);
9493 		}
9494 	}
9495 #endif /* CONFIG_FSE */
9496 
9497 	/*
9498 	 * update filesystem's mount point data
9499 	 */
9500 	if (mntrename) {
9501 		char *cp, *pathend, *mpname;
9502 		char * tobuf;
9503 		struct mount *mp;
9504 		int maxlen;
9505 		size_t len = 0;
9506 
9507 		mp = fvp->v_mountedhere;
9508 
9509 		if (vfs_busy(mp, LK_NOWAIT)) {
9510 			error = EBUSY;
9511 			goto out1;
9512 		}
9513 		tobuf = zalloc(ZV_NAMEI);
9514 
9515 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9516 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9517 		} else {
9518 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9519 		}
9520 		if (!error) {
9521 			/* find current mount point prefix */
9522 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9523 			for (cp = pathend; *cp != '\0'; ++cp) {
9524 				if (*cp == '/') {
9525 					pathend = cp + 1;
9526 				}
9527 			}
9528 			/* find last component of target name */
9529 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9530 				if (*cp == '/') {
9531 					mpname = cp + 1;
9532 				}
9533 			}
9534 
9535 			/* Update f_mntonname of sub mounts */
9536 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9537 
9538 			/* append name to prefix */
9539 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9540 			bzero(pathend, maxlen);
9541 
9542 			strlcpy(pathend, mpname, maxlen);
9543 		}
9544 		zfree(ZV_NAMEI, tobuf);
9545 
9546 		vfs_unbusy(mp);
9547 
9548 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9549 	}
9550 	/*
9551 	 * fix up name & parent pointers.  note that we first
9552 	 * check that fvp has the same name/parent pointers it
9553 	 * had before the rename call... this is a 'weak' check
9554 	 * at best...
9555 	 *
9556 	 * XXX oparent and oname may not be set in the compound vnop case
9557 	 */
9558 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9559 		int update_flags;
9560 
9561 		update_flags = VNODE_UPDATE_NAME;
9562 
9563 		if (fdvp != tdvp) {
9564 			update_flags |= VNODE_UPDATE_PARENT;
9565 		}
9566 
9567 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9568 	}
9569 out1:
9570 	/*
9571 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9572 	 * skipped earlier as no actual rename was performed.
9573 	 */
9574 	if (vn_authorize_skipped && error == 0) {
9575 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9576 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9577 		    flags, NULL);
9578 		if (error && error == ENOENT) {
9579 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9580 				do_retry = 1;
9581 				retry_count += 1;
9582 			}
9583 		}
9584 	}
9585 	if (to_name != NULL) {
9586 		RELEASE_PATH(to_name);
9587 		to_name = NULL;
9588 	}
9589 	if (to_name_no_firmlink != NULL) {
9590 		RELEASE_PATH(to_name_no_firmlink);
9591 		to_name_no_firmlink = NULL;
9592 	}
9593 	if (from_name != NULL) {
9594 		RELEASE_PATH(from_name);
9595 		from_name = NULL;
9596 	}
9597 	if (from_name_no_firmlink != NULL) {
9598 		RELEASE_PATH(from_name_no_firmlink);
9599 		from_name_no_firmlink = NULL;
9600 	}
9601 	if (holding_mntlock) {
9602 		mount_unlock_renames(locked_mp);
9603 		mount_drop(locked_mp, 0);
9604 		holding_mntlock = 0;
9605 	}
9606 	if (tdvp) {
9607 		/*
9608 		 * nameidone has to happen before we vnode_put(tdvp)
9609 		 * since it may need to release the fs_nodelock on the tdvp
9610 		 */
9611 		nameidone(tond);
9612 
9613 		if (tvp) {
9614 			vnode_put(tvp);
9615 		}
9616 		vnode_put(tdvp);
9617 	}
9618 	if (fdvp) {
9619 		/*
9620 		 * nameidone has to happen before we vnode_put(fdvp)
9621 		 * since it may need to release the fs_nodelock on the fdvp
9622 		 */
9623 		nameidone(fromnd);
9624 
9625 		if (fvp) {
9626 			vnode_put(fvp);
9627 		}
9628 		vnode_put(fdvp);
9629 	}
9630 	if (mnt_fvp != NULLVP) {
9631 		vnode_put(mnt_fvp);
9632 	}
9633 	/*
9634 	 * If things changed after we did the namei, then we will re-drive
9635 	 * this rename call from the top.
9636 	 */
9637 	if (do_retry) {
9638 		do_retry = 0;
9639 		goto retry;
9640 	}
9641 
9642 	kfree_type(typeof(*__rename_data), __rename_data);
9643 	return error;
9644 }
9645 
9646 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9647 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9648 {
9649 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9650 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9651 }
9652 
9653 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9654 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9655 {
9656 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9657 		return EINVAL;
9658 	}
9659 
9660 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9661 		return EINVAL;
9662 	}
9663 
9664 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9665 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9666 }
9667 
9668 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9669 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9670 {
9671 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9672 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9673 }
9674 
9675 /*
9676  * Make a directory file.
9677  *
9678  * Returns:	0			Success
9679  *		EEXIST
9680  *	namei:???
9681  *	vnode_authorize:???
9682  *	vn_create:???
9683  */
9684 /* ARGSUSED */
/*
 * Common worker for the mkdir()/mkdirat() family: create a directory at
 * 'path' (resolved relative to 'fd', with address space 'segflg') using
 * the attributes in 'vap'.  Supports both the classic two-step
 * lookup+VNOP_MKDIR path and compound-mkdir file systems (via
 * NAMEI_COMPOUNDMKDIR / EKEEPLOOKING continuation).
 *
 * Returns 0 on success, EEXIST if the target already exists, or an
 * errno from namei/authorization/vn_create.
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

	/* Re-entered when a compound VNOP returns EKEEPLOOKING below. */
continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the name already exists: mkdir must fail. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the first lookup's state before redriving namei. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target absent: report the original EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the parent: break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound VNOP needs more lookup work; loop back with state intact. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9800 
9801 /*
9802  * mkdir_extended: Create a directory; with extended security (ACL).
9803  *
9804  * Parameters:    p                       Process requesting to create the directory
9805  *                uap                     User argument descriptor (see below)
9806  *                retval                  (ignored)
9807  *
9808  * Indirect:      uap->path               Path of directory to create
9809  *                uap->mode               Access permissions to set
9810  *                uap->xsecurity          ACL to set
9811  *
9812  * Returns:        0                      Success
9813  *                !0                      Not success
9814  *
9815  */
9816 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9817 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9818 {
9819 	int ciferror;
9820 	kauth_filesec_t xsecdst;
9821 	struct vnode_attr va;
9822 
9823 	AUDIT_ARG(owner, uap->uid, uap->gid);
9824 
9825 	xsecdst = NULL;
9826 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9827 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9828 		return ciferror;
9829 	}
9830 
9831 	VATTR_INIT(&va);
9832 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9833 	if (xsecdst != NULL) {
9834 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9835 		va.va_vaflags |= VA_FILESEC_ACL;
9836 	}
9837 
9838 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9839 	    UIO_USERSPACE);
9840 	if (xsecdst != NULL) {
9841 		kauth_filesec_free(xsecdst);
9842 	}
9843 	return ciferror;
9844 }
9845 
9846 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9847 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9848 {
9849 	struct vnode_attr va;
9850 
9851 	VATTR_INIT(&va);
9852 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9853 
9854 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9855 	           UIO_USERSPACE);
9856 }
9857 
9858 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9859 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9860 {
9861 	struct vnode_attr va;
9862 
9863 	VATTR_INIT(&va);
9864 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9865 
9866 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9867 	           UIO_USERSPACE);
9868 }
9869 
/*
 * Common worker backing rmdir() (and related directory-removal callers):
 * look up 'dirpath' (relative to 'fd', address space 'segflg') and remove
 * the directory.  Handles compound-rmdir file systems, redrives on racy
 * ENOENT, optionally removes dataless directories via VNOP_REMOVE, and
 * retries after cleaning orphaned AppleDouble files.
 *
 * Returns 0 on success or an errno value.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* nameidata (and the FSE vnode_attr) are heap-allocated together,
	 * presumably to keep this frame off the kernel stack -- they are large. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the removal flag into its namei equivalent. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
/* Re-entered when a compound VNOP returns EKEEPLOOKING below. */
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racy lookup: redrive, bounded by retry count. */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: only possible when the FS supports compound rmdir. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound path: attributes are gathered post-VNOP via vap. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Capture the path(s) before the entry disappears. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry writes the parent: break any directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Brief pause before retrying the whole removal. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10171 
10172 /*
10173  * Remove a directory file.
10174  */
10175 /* ARGSUSED */
10176 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10177 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10178 {
10179 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10180 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10181 }
10182 
/* Get direntry length padded to 8 byte alignment.  struct direntry embeds a
 * MAXPATHLEN-sized name buffer; subtracting (MAXPATHLEN-1) trims it to the
 * actual name length before rounding up. */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/* Get dirent length padded to 4 byte alignment.  Analogous trimming of the
 * (__DARWIN_MAXNAMLEN + 1)-sized name buffer in struct dirent. */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent: address of its last byte per d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10194 
10195 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10196 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10197     int *numdirent, vfs_context_t ctxp)
10198 {
10199 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10200 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10201 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10202 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10203 	} else {
10204 		size_t bufsize;
10205 		void * bufptr;
10206 		uio_t auio;
10207 		struct direntry *entry64;
10208 		struct dirent *dep;
10209 		size_t bytesread;
10210 		int error;
10211 
10212 		/*
10213 		 * We're here because the underlying file system does not
10214 		 * support direnties or we mounted denying support so we must
10215 		 * fall back to dirents and convert them to direntries.
10216 		 *
10217 		 * Our kernel buffer needs to be smaller since re-packing will
10218 		 * expand each dirent.  The worse case (when the name length
10219 		 * is 3 or less) corresponds to a struct direntry size of 32
10220 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10221 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10222 		 * will prevent us from reading more than we can pack.
10223 		 *
10224 		 * Since this buffer is wired memory, we will limit the
10225 		 * buffer size to a maximum of 32K. We would really like to
10226 		 * use 32K in the MIN(), but we use magic number 87371 to
10227 		 * prevent uio_resid() * 3 / 8 from overflowing.
10228 		 */
10229 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10230 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10231 		if (bufptr == NULL) {
10232 			return ENOMEM;
10233 		}
10234 
10235 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10236 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10237 		auio->uio_offset = uio->uio_offset;
10238 
10239 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10240 
10241 		dep = (struct dirent *)bufptr;
10242 		bytesread = bufsize - uio_resid(auio);
10243 
10244 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10245 		/*
10246 		 * Convert all the entries and copy them out to user's buffer.
10247 		 */
10248 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10249 			/* First check that the dirent struct up to d_name is within the buffer */
10250 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10251 			    /* Check that the length of the entire dirent is within the buffer */
10252 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10253 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10254 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10255 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10256 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10257 				    vp->v_name ? vp->v_name : "<unknown>");
10258 				error = EIO;
10259 				break;
10260 			}
10261 
10262 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10263 
10264 			bzero(entry64, enbufsize);
10265 			/* Convert a dirent to a dirent64. */
10266 			entry64->d_ino = dep->d_ino;
10267 			entry64->d_seekoff = 0;
10268 			entry64->d_reclen = (uint16_t)enbufsize;
10269 			entry64->d_namlen = dep->d_namlen;
10270 			entry64->d_type = dep->d_type;
10271 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10272 
10273 			/* Move to next entry. */
10274 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10275 
10276 			/* Copy entry64 to user's buffer. */
10277 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10278 		}
10279 
10280 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10281 		if (error == 0) {
10282 			uio->uio_offset = auio->uio_offset;
10283 		}
10284 		uio_free(auio);
10285 		kfree_data(bufptr, bufsize);
10286 		kfree_type(struct direntry, entry64);
10287 		return error;
10288 	}
10289 }
10290 
/* Cap on the caller-supplied buffer size for a single getdirentries() call. */
#define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10292 
10293 /*
10294  * Read a block of directory entries in a file system independent format.
10295  */
/*
 * Shared implementation of the getdirentries() family: read up to
 * 'bufsize' bytes of directory entries from file descriptor 'fd' into
 * the user buffer 'bufp', advancing the descriptor's file offset.
 * On success, *bytesread is the number of bytes produced, *offset (if
 * non-NULL) receives the pre-read file offset, and *eofflag reflects
 * end-of-directory.  With VNODE_READDIR_EXTENDED in 'flags', entries
 * are produced in struct direntry format via vnode_readdir64().
 *
 * Returns 0 on success or an errno value.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize offset updates for this open file.  If the fd's vnode
	 * changed between the lookup and taking the lock (e.g. the union
	 * mount swap below raced with us), drop everything and re-fetch.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
/* Re-entered after descending to the lower layer of a union mount. */
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read from the descriptor's current offset, then advance it. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: swap the fd's data
	 * to the covered (lower) vnode, reset the offset, and read again.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10409 
10410 
10411 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10412 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10413 {
10414 	off_t offset;
10415 	ssize_t bytesread;
10416 	int error, eofflag;
10417 
10418 	AUDIT_ARG(fd, uap->fd);
10419 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10420 	    &bytesread, &offset, &eofflag, 0);
10421 
10422 	if (error == 0) {
10423 		if (proc_is64bit(p)) {
10424 			user64_long_t base = (user64_long_t)offset;
10425 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10426 		} else {
10427 			user32_long_t base = (user32_long_t)offset;
10428 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10429 		}
10430 		*retval = (int)bytesread;
10431 	}
10432 	return error;
10433 }
10434 
10435 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10436 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10437 {
10438 	off_t offset;
10439 	ssize_t bytesread;
10440 	int error, eofflag;
10441 	user_size_t bufsize;
10442 
10443 	AUDIT_ARG(fd, uap->fd);
10444 
10445 	/*
10446 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10447 	 * then the kernel carves out the last 4 bytes to return extended
10448 	 * information to userspace (namely whether we reached EOF with this call).
10449 	 */
10450 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10451 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10452 	} else {
10453 		bufsize = uap->bufsize;
10454 	}
10455 
10456 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10457 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10458 
10459 	if (error == 0) {
10460 		*retval = bytesread;
10461 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10462 
10463 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10464 			getdirentries64_flags_t flags = 0;
10465 			if (eofflag) {
10466 				flags |= GETDIRENTRIES64_EOF;
10467 			}
10468 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10469 			    sizeof(flags));
10470 		}
10471 	}
10472 	return error;
10473 }
10474 
10475 
10476 /*
10477  * Set the mode mask for creation of filesystem nodes.
10478  * XXX implement xsecurity
10479  */
10480 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10481 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10482 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10483 {
10484 	AUDIT_ARG(mask, newmask);
10485 	proc_fdlock(p);
10486 	*retval = p->p_fd.fd_cmask;
10487 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10488 	proc_fdunlock(p);
10489 	return 0;
10490 }
10491 
10492 /*
10493  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10494  *
10495  * Parameters:    p                       Process requesting to set the umask
10496  *                uap                     User argument descriptor (see below)
10497  *                retval                  umask of the process (parameter p)
10498  *
10499  * Indirect:      uap->newmask            umask to set
10500  *                uap->xsecurity          ACL to set
10501  *
10502  * Returns:        0                      Success
10503  *                !0                      Not success
10504  *
10505  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * NOTE: uap->xsecurity is currently ignored -- the ACL side is not yet
	 * implemented (see the "XXX implement xsecurity" note above), so
	 * KAUTH_FILESEC_NONE is passed unconditionally.
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10511 
/*
 * umask() system call: set the file mode creation mask; the previous mask
 * is returned through *retval.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	/* UMASK_NOXSECURITY: leave any existing extended security alone. */
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10517 
10518 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10519 	"com.apple.private.vfs.revoke-mounted-device"
10520 
10521 /*
10522  * Void all references to file by ripping underlying filesystem
10523  * away from vnode.
10524  */
10525 /* ARGSUSED */
/*
 * revoke() system call: revoke all outstanding access to a character or
 * block special file.  Caller must own the node or be superuser.
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) and audit it. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files may be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that currently has a mount on it. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Permission check: caller must own the node or pass suser(). */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only issue the revoke if the node is actually in use or aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10578 
10579 
10580 /*
 *  HFS/HFS PLUS SPECIFIC SYSTEM CALLS
10582  *  The following system calls are designed to support features
10583  *  which are specific to the HFS & HFS Plus volume formats
10584  */
10585 
10586 
10587 /*
10588  * Obtain attribute information on objects in a directory while enumerating
10589  * the directory.
10590  */
10591 /* ARGSUSED */
/*
 * getdirentriesattr() system call: enumerate a directory while returning
 * the requested attributes for each entry via VNOP_READDIRATTR.
 * On success, *retval is the eofflag (0 or 1), and the entry count,
 * directory state, and starting offset are copied out to uap->count,
 * uap->newstate and uap->basep respectively.
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's requested count so it can be restored when
	 * the union-mount path below restarts on a lower layer. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-fd offset lock, then re-check that the fd still maps
	 * to the vnode we resolved (the union traversal below can swap it);
	 * if not, drop everything and re-resolve the fd.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	/* NOTE(review): error is always 0 here -- every failing path above
	 * jumped to 'out' already, so this check appears to be dead code. */
	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10755 
10756 /*
10757  * Exchange data between two files
10758  */
10759 
10760 /* ARGSUSED */
/*
 * exchangedata() system call: atomically exchange the data of two regular
 * files on the same volume via VNOP_EXCHANGE.  On success the cached
 * v_name/v_parent of the two vnodes are swapped so the name cache stays
 * consistent, and fsevents/fileop listeners are notified.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* FSOPT_NOFOLLOW suppresses symlink traversal on both paths. */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs both read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only compute the (possibly expensive) paths if something will
	 * actually consume them: fsevents or fileop listeners.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The data moved between the two nodes, so swap their cached
		 * names and parents to keep the name cache coherent.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10911 
10912 /*
10913  * Return (in MB) the amount of freespace on the given vnode's volume.
10914  */
10915 uint32_t freespace_mb(vnode_t vp);
10916 
10917 uint32_t
freespace_mb(vnode_t vp)10918 freespace_mb(vnode_t vp)
10919 {
10920 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10921 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10922 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10923 }
10924 
10925 #if CONFIG_SEARCHFS
10926 
10927 /* ARGSUSED */
10928 
/*
 * searchfs() system call: fast catalog search on the volume containing
 * uap->path, via VNOP_SEARCHFS.  The user-supplied fssearchblock is copied
 * in (32-bit layouts are munged to the 64-bit form), validated, and the
 * search is run against the volume root.  Union mounts are searched one
 * layer per call, with EAGAIN returned until all layers are done.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 * Bounding both also keeps the mallocsize computation below from
	 * overflowing.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well  put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	/* NOTE(review): the allocation result is not NULL-checked before the
	 * copyin below -- presumably Z_WAITOK cannot fail for this bounded
	 * size; confirm. */
	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 *
	 * NOTE(review): only searchparams1 is validated below, despite the
	 * comment above mentioning both buffers -- confirm searchparams2
	 * needs no equivalent check.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Alright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11211 
11212 #else /* CONFIG_SEARCHFS */
11213 
/* Stub for kernels built without CONFIG_SEARCHFS: searchfs() is unsupported. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11219 
11220 #endif /* CONFIG_SEARCHFS */
11221 
11222 
11223 #if CONFIG_DATALESS_FILES
11224 
11225 /*
11226  * === Namespace Resolver Up-call Mechanism ===
11227  *
11228  * When I/O is performed to a dataless file or directory (read, write,
11229  * lookup-in, etc.), the file system performs an upcall to the namespace
11230  * resolver (filecoordinationd) to materialize the object.
11231  *
11232  * We need multiple up-calls to be in flight at once, and we need these
11233  * up-calls to be interruptible, thus the following implementation:
11234  *
11235  * => The nspace_resolver_request represents the in-kernel request state.
11236  *    It contains a request ID, storage space for the errno code returned
11237  *    by filecoordinationd, and flags.
11238  *
11239  * => The request ID is simply a global monotonically incrementing 32-bit
11240  *    number.  Outstanding requests are stored in a hash table, and the
11241  *    hash function is extremely simple.
11242  *
11243  * => When an upcall is to be made to filecoordinationd, a request structure
11244  *    is allocated on the stack (it is small, and needs to live only during
11245  *    the duration of the call to resolve_nspace_item_ext()).  It is
11246  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11248  *    can be inserted into the table (and thus limiting the number of
11249  *    outstanding requests issued to filecoordinationd); waiting for an
11250  *    available slot is interruptible.
11251  *
11252  * => Once the request has been inserted into the table, the up-call is made
11253  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11254  *    immediately and filecoordinationd processes the request asynchronously.
11255  *
 * => The caller now waits for the request to complete.  This is achieved by
11257  *    sleeping on the address of the request structure and waiting for
11258  *    filecoordinationd to mark the request structure as complete.  This
11259  *    is an interruptible sleep call; if interrupted, the request structure
11260  *    is removed from the table and EINTR is returned to the caller.  If
11261  *    this occurs, an advisory up-call is made to filecoordinationd with
11262  *    the request ID to indicate that the request can be aborted or
11263  *    de-prioritized at the discretion of filecoordinationd.
11264  *
11265  * => When filecoordinationd has completed the request, it signals completion
11266  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11267  *    decorated as a namespace resolver can write to this sysctl node.  The
11268  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11269  *    The request ID is looked up in the table, and if the request is found,
11270  *    the error code is stored in the request structure and a wakeup()
11271  *    issued on the address of the request structure.  If the request is not
11272  *    found, we simply drop the completion notification, assuming that the
11273  *    caller was interrupted.
11274  *
11275  * => When the waiting thread wakes up, it extracts the error code from the
11276  *    request structure, removes the request from the table, and returns the
11277  *    error code to the calling function.  Fini!
11278  */
11279 
/*
 * One outstanding materialization request.  Lives on the requesting
 * thread's stack for the duration of the up-call (see the big comment
 * above); linked into the request hash table while outstanding.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;  /* hash bucket linkage */
	vnode_t         r_vp;             /* vnode being materialized */
	vnode_t         r_tdvp;           /* rename destination dir, or NULL */
	uint32_t        r_req_id;         /* ID shared with filecoordinationd */
	int             r_resolver_error; /* errno result for the waiter */
	int             r_flags;          /* RRF_* flags below */
};
11288 
#define RRF_COMPLETE    0x0001  /* request finished; waiter may return */
#define RRF_COMPLETING  0x0002  /* completion handler still using the request */
11291 
/*
 * Completion record written by the resolver via the vfs.nspace.complete
 * sysctl (see sysctl_nspace_complete()).
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;         /* ID of the request being completed */
	int32_t  resolver_error; /* errno from the resolver; 0 on success */
	uint64_t orig_gencount;  /* expected recursive gencount; 0 == don't check */
	uint64_t orig_syncroot;  /* expected sync-root ID; 0 == don't check */
};
11298 
11299 static uint32_t
next_nspace_req_id(void)11300 next_nspace_req_id(void)
11301 {
11302 	static uint32_t next_req_id;
11303 
11304 	return OSAddAtomic(1, &next_req_id);
11305 }
11306 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/*
 * Hash table of outstanding requests, keyed by request ID.  The table,
 * the outstanding count, and the wait flag are manipulated under
 * nspace_resolver_request_hash_mutex.
 */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
static u_int nspace_resolver_request_count;
/* set when a thread sleeps waiting for a free table slot (see req_add) */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11327 
11328 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11329 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11330 {
11331 	struct nspace_resolver_requesthead *bucket;
11332 	struct nspace_resolver_request *req;
11333 
11334 	bucket = NSPACE_RESOLVER_HASH(req_id);
11335 	LIST_FOREACH(req, bucket, r_hashlink) {
11336 		if (req->r_req_id == req_id) {
11337 			/*
11338 			 * If this request already has a completion
11339 			 * pending, don't return it again.
11340 			 */
11341 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11342 			    skip_completing) {
11343 				req = NULL;
11344 			}
11345 			return req;
11346 		}
11347 	}
11348 
11349 	return NULL;
11350 }
11351 
11352 static int
nspace_resolver_req_add(struct nspace_resolver_request * req)11353 nspace_resolver_req_add(struct nspace_resolver_request *req)
11354 {
11355 	struct nspace_resolver_requesthead *bucket;
11356 	int error;
11357 
11358 	NSPACE_REQ_LOCK();
11359 
11360 	while (nspace_resolver_request_count >=
11361 	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
11362 		nspace_resolver_request_wait_slot = true;
11363 		error = msleep(&nspace_resolver_request_count,
11364 		    &nspace_resolver_request_hash_mutex,
11365 		    PVFS | PCATCH, "nspacerq", NULL);
11366 		if (error) {
11367 			NSPACE_REQ_UNLOCK();
11368 			return error;
11369 		}
11370 	}
11371 
11372 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11373 #if DIAGNOSTIC
11374 	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
11375 #endif /* DIAGNOSTIC */
11376 	LIST_INSERT_HEAD(bucket, req, r_hashlink);
11377 	nspace_resolver_request_count++;
11378 
11379 	NSPACE_REQ_UNLOCK();
11380 
11381 	return 0;
11382 }
11383 
/*
 * Block until no completion handler is using 'req'.  Called with
 * NSPACE_REQ_LOCK held (msleep drops and re-takes it).
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11397 
/*
 * Remove 'req' from the table, wake any thread waiting for a free
 * slot, wait out any in-flight completion, and drop the table lock.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* Wake a thread sleeping in nspace_resolver_req_add(), if any. */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	/*
	 * 'req' lives on a requester's stack; don't let it go out of
	 * scope while a completion handler is still dereferencing it.
	 */
	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11422 
/*
 * Remove 'req' from the table.  Takes the table lock; the lock is
 * dropped by nspace_resolver_req_remove_and_unlock().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11429 
11430 static void
nspace_resolver_req_cancel(uint32_t req_id)11431 nspace_resolver_req_cancel(uint32_t req_id)
11432 {
11433 	kern_return_t kr;
11434 	mach_port_t mp;
11435 
11436 	// Failures here aren't fatal -- the cancellation message
11437 	// sent to the resolver is merely advisory.
11438 
11439 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11440 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11441 		return;
11442 	}
11443 
11444 	kr = send_nspace_resolve_cancel(mp, req_id);
11445 	if (kr != KERN_SUCCESS) {
11446 		os_log_error(OS_LOG_DEFAULT,
11447 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11448 	}
11449 
11450 	ipc_port_release_send(mp);
11451 }
11452 
/*
 * Wait (interruptibly) for the resolver to complete 'req', then remove
 * it from the table and return the resolver's errno.  If the sleep is
 * interrupted, EINTR (or ETIMEDOUT) is recorded as the result and an
 * advisory cancel message is sent to the resolver.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: supply our own result. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11485 
/*
 * Record the resolver's result on 'req', clear RRF_COMPLETING, set
 * RRF_COMPLETE, and wake any thread sleeping on the request.  Callers
 * hold NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
11495 
/*
 * Flag 'req' as having a completion in progress so it is neither
 * matched again nor torn down while the completion handler works on it
 * outside the table lock.  Callers hold NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11501 
/*
 * Handle a completion reported by the resolver: locate the request,
 * optionally verify the namespace-shape criteria (recursive gencount
 * and/or sync-root ID) under the mount rename lock, then mark the
 * request complete and wake its waiter.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	/* Skip requests that already have a completion in flight. */
	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): 'error' is always 0 here; check looks vestigial. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* EBUSY if the directory's shape changed since the request. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): 'error' is always 0 here as well. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	/* Re-acquire the table lock before marking completion. */
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	/* Reached with NSPACE_REQ_LOCK held on every path. */
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11614 
/* Process currently decorated as the dataless namespace resolver, or NULL. */
static struct proc *nspace_resolver_proc;
11616 
11617 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11618 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11619 {
11620 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11621 	    p == nspace_resolver_proc) ? 1 : 0;
11622 	return 0;
11623 }
11624 
/* Defined later in this file (under CONFIG_DATALESS_FILES). */
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11626 
11627 static int
nspace_resolver_set_proc_state(struct proc * p,int is_resolver)11628 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
11629 {
11630 	vfs_context_t ctx = vfs_context_current();
11631 	int error = 0;
11632 
11633 	//
11634 	// The system filecoordinationd runs as uid == 0.  This also
11635 	// has the nice side-effect of filtering out filecoordinationd
11636 	// running in the simulator.
11637 	//
11638 	if (!vfs_context_issuser(ctx) ||
11639 	    !vfs_context_is_dataless_resolver(ctx)) {
11640 		return EPERM;
11641 	}
11642 
11643 	if (is_resolver) {
11644 		NSPACE_REQ_LOCK();
11645 
11646 		if (nspace_resolver_proc == NULL) {
11647 			proc_lock(p);
11648 			p->p_lflag |= P_LNSPACE_RESOLVER;
11649 			proc_unlock(p);
11650 			nspace_resolver_proc = p;
11651 		} else {
11652 			error = EBUSY;
11653 		}
11654 
11655 		NSPACE_REQ_UNLOCK();
11656 	} else {
11657 		// This is basically just like the exit case.
11658 		// nspace_resolver_exited() will verify that the
11659 		// process is the resolver, and will clear the
11660 		// global.
11661 		nspace_resolver_exited(p);
11662 	}
11663 
11664 	return error;
11665 }
11666 
11667 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11668 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11669 {
11670 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11671 	    (p->p_vfs_iopolicy &
11672 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11673 		*is_prevented = 1;
11674 	} else {
11675 		*is_prevented = 0;
11676 	}
11677 	return 0;
11678 }
11679 
11680 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11681 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11682 {
11683 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
11684 		return is_prevented ? 0 : EBUSY;
11685 	}
11686 
11687 	if (is_prevented) {
11688 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11689 	} else {
11690 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11691 	}
11692 	return 0;
11693 }
11694 
11695 static int
nspace_materialization_get_thread_state(int * is_prevented)11696 nspace_materialization_get_thread_state(int *is_prevented)
11697 {
11698 	uthread_t ut = current_uthread();
11699 
11700 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11701 	return 0;
11702 }
11703 
/*
 * Set or clear the current thread's UT_NSPACE_NODATALESSFAULTS
 * decoration (checked by
 * vfs_context_dataless_materialization_is_prevented()).  Always
 * returns 0.
 */
static int
nspace_materialization_set_thread_state(int is_prevented)
{
	uthread_t ut = current_uthread();

	if (is_prevented) {
		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
	} else {
		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
	}
	return 0;
}
11716 
/* the vfs.nspace branch: parent node for the dataless-file sysctls below */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11719 
11720 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11721 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11722     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11723 {
11724 	struct proc *p = req->p;
11725 	int new_value, old_value, changed = 0;
11726 	int error;
11727 
11728 	error = nspace_resolver_get_proc_state(p, &old_value);
11729 	if (error) {
11730 		return error;
11731 	}
11732 
11733 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11734 	    &changed);
11735 	if (error == 0 && changed) {
11736 		error = nspace_resolver_set_proc_state(p, new_value);
11737 	}
11738 	return error;
11739 }
11740 
/* decorate this process as the dataless file resolver (see handler above) */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11745 
11746 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11747 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11748     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11749 {
11750 	struct proc *p = req->p;
11751 	int new_value, old_value, changed = 0;
11752 	int error;
11753 
11754 	error = nspace_materialization_get_proc_state(p, &old_value);
11755 	if (error) {
11756 		return error;
11757 	}
11758 
11759 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11760 	    &changed);
11761 	if (error == 0 && changed) {
11762 		error = nspace_materialization_set_proc_state(p, new_value);
11763 	}
11764 	return error;
11765 }
11766 
/* decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11771 
11772 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11773 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11774     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11775 {
11776 	int new_value, old_value, changed = 0;
11777 	int error;
11778 
11779 	error = nspace_materialization_get_thread_state(&old_value);
11780 	if (error) {
11781 		return error;
11782 	}
11783 
11784 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11785 	    &changed);
11786 	if (error == 0 && changed) {
11787 		error = nspace_materialization_set_thread_state(new_value);
11788 	}
11789 	return error;
11790 }
11791 
/* decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11796 
/*
 * vfs.nspace.complete handler: the resolver writes a request ID /
 * errno pair (two uint32_t's), optionally followed by a gencount and a
 * sync-root ID (each uint64_t, read sequentially from the same request
 * buffer).  Only the decorated resolver process may write here.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* Required payload: request ID and resolver errno. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
11859 
/* Resolver reports completed reqs here (see sysctl_nspace_complete). */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11864 
#endif /* CONFIG_DATALESS_FILES */

/* Marks parameters that are unused when dataless support is compiled out. */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused    /* nothing */
#else
#define __no_dataless_unused    __unused
#endif
11872 
11873 int
vfs_context_dataless_materialization_is_prevented(vfs_context_t const ctx __no_dataless_unused)11874 vfs_context_dataless_materialization_is_prevented(
11875 	vfs_context_t const ctx __no_dataless_unused)
11876 {
11877 #if CONFIG_DATALESS_FILES
11878 	proc_t const p = vfs_context_proc(ctx);
11879 	thread_t const t = vfs_context_thread(ctx);
11880 	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;
11881 
11882 	/*
11883 	 * Kernel context ==> return EDEADLK, as we would with any random
11884 	 * process decorated as no-materialize.
11885 	 */
11886 	if (ctx == vfs_context_kernel()) {
11887 		return EDEADLK;
11888 	}
11889 
11890 	/*
11891 	 * If the process has the dataless-manipulation entitlement,
11892 	 * materialization is prevented, and depending on the kind
11893 	 * of file system operation, things get to proceed as if the
11894 	 * object is not dataless.
11895 	 */
11896 	if (vfs_context_is_dataless_manipulator(ctx)) {
11897 		return EJUSTRETURN;
11898 	}
11899 
11900 	/*
11901 	 * Per-thread decorations override any process-wide decorations.
11902 	 * (Foundation uses this, and this overrides even the dataless-
11903 	 * manipulation entitlement so as to make API contracts consistent.)
11904 	 */
11905 	if (ut != NULL) {
11906 		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
11907 			return EDEADLK;
11908 		}
11909 		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
11910 			return 0;
11911 		}
11912 	}
11913 
11914 	/*
11915 	 * If the process's iopolicy specifies that dataless files
11916 	 * can be materialized, then we let it go ahead.
11917 	 */
11918 	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
11919 		return 0;
11920 	}
11921 #endif /* CONFIG_DATALESS_FILES */
11922 
11923 	/*
11924 	 * The default behavior is to not materialize dataless files;
11925 	 * return to the caller that deadlock was detected.
11926 	 */
11927 	return EDEADLK;
11928 }
11929 
/*
 * One-time initialization: allocate the resolver request hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11939 
/*
 * Tear down resolver state for 'p': if it is the decorated resolver,
 * fail every outstanding request with ETIMEDOUT and clear the global.
 * Called from the un-decorate sysctl path; the name suggests it is
 * also called at process exit (caller not visible here).
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Complete (not remove) each request; waiters do the removal. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/* Let any in-flight completion finish first. */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11966 
/* Entitlements gating resolver registration and dataless manipulation. */
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"
11971 
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver (i.e. its task holds the dataless-resolver entitlement).
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	task_t const task = vfs_context_task(ctx);

	return IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11984 
11985 /*
11986  * Return TRUE if the vfs context is associated with a process entitled
11987  * for dataless manipulation.
11988  *
11989  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11990  * complication around CONFIG_DATALESS_FILES.
11991  */
11992 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11993 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11994 {
11995 #if CONFIG_DATALESS_FILES
11996 	task_t task = vfs_context_task(ctx);
11997 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11998 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11999 #else
12000 	return false;
12001 #endif /* CONFIG_DATALESS_FILES */
12002 }
12003 
#if CONFIG_DATALESS_FILES
/*
 * Log that the current process was blocked from materializing the
 * dataless vnode 'vp' during operation 'op'.  DEVELOPMENT builds also
 * log the vnode's path.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	const char *vntype;

	proc_selfname(&p_name[0], sizeof(p_name));

	/* Human-readable vnode type for the log message. */
	switch (vp->v_type) {
	case VREG:
		vntype = "File";
		break;
	case VDIR:
		vntype = "Dir";
		break;
	case VLNK:
		vntype = "SymLink";
		break;
	default:
		vntype = "Other";
		break;
	}

#if DEVELOPMENT
	char *path = get_pathbuff();
	int   len = MAXPATHLEN;

	if (path != NULL) {
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path != NULL) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12047 
12048 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12049 vfs_materialize_item(
12050 	vnode_t vp __no_dataless_unused,
12051 	uint32_t op __no_dataless_unused,
12052 	int64_t offset __no_dataless_unused,
12053 	int64_t size __no_dataless_unused,
12054 	char *lookup_name __no_dataless_unused,
12055 	size_t const namelen __no_dataless_unused,
12056 	vnode_t tdvp __no_dataless_unused)
12057 {
12058 #if CONFIG_DATALESS_FILES
12059 	kern_return_t kern_ret;
12060 	mach_port_t mach_port;
12061 	char *path = NULL;
12062 	vfs_context_t context;
12063 	int path_len;
12064 	int error;
12065 	audit_token_t atoken;
12066 	enum vtype vp_vtype;
12067 
12068 	/* Swap files are special; ignore them */
12069 	if (vnode_isswap(vp)) {
12070 		return 0;
12071 	}
12072 
12073 	/*
12074 	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12075 	 * are no longer used nor supported.
12076 	 */
12077 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12078 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12079 		return ENOTSUP;
12080 	}
12081 	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12082 		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12083 		return ENOTSUP;
12084 	}
12085 
12086 	/* Normalize 'op'. */
12087 	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12088 
12089 	/*
12090 	 * To-directory is only meaningful for rename operations;
12091 	 * ignore it if someone handed one to us unexpectedly.
12092 	 */
12093 	if (op != NAMESPACE_HANDLER_RENAME_OP) {
12094 		tdvp = NULL;
12095 	}
12096 
12097 	context = vfs_context_current();
12098 
12099 	/* Remember this for later. */
12100 	vp_vtype = vnode_vtype(vp);
12101 
12102 	error = vfs_context_dataless_materialization_is_prevented(context);
12103 	if (error) {
12104 		log_materialization_prevented(vp, op);
12105 		goto out_check_errors;
12106 	}
12107 
12108 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12109 	    &mach_port);
12110 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12111 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12112 		/*
12113 		 * Treat this like being unable to access the backing store
12114 		 * server.
12115 		 */
12116 		return ETIMEDOUT;
12117 	}
12118 
12119 	int path_alloc_len = MAXPATHLEN;
12120 	do {
12121 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12122 		if (path == NULL) {
12123 			return ENOMEM;
12124 		}
12125 
12126 		path_len = path_alloc_len;
12127 		error = vn_getpath(vp, path, &path_len);
12128 		if (error == 0) {
12129 			break;
12130 		} else if (error == ENOSPC) {
12131 			kfree_data(path, path_alloc_len);
12132 			path = NULL;
12133 		} else {
12134 			goto out_release_port;
12135 		}
12136 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12137 
12138 	error = vfs_context_copy_audit_token(context, &atoken);
12139 	if (error) {
12140 		goto out_release_port;
12141 	}
12142 
12143 	struct nspace_resolver_request req = {
12144 		.r_req_id = next_nspace_req_id(),
12145 		.r_vp = vp,
12146 		.r_tdvp = tdvp,
12147 	};
12148 
12149 	error = nspace_resolver_req_add(&req);
12150 	if (error) {
12151 		goto out_release_port;
12152 	}
12153 
12154 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12155 
12156 	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12157 		char *dest_path = NULL;
12158 		int dest_path_len;
12159 
12160 		dest_path = zalloc(ZV_NAMEI);
12161 		dest_path_len = MAXPATHLEN;
12162 
12163 		error = vn_getpath(tdvp, dest_path, &dest_path_len);
12164 		if (error) {
12165 			zfree(ZV_NAMEI, dest_path);
12166 			goto out_release_port;
12167 		}
12168 
12169 		/*
12170 		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12171 		 * compatibility with existing agents in user-space
12172 		 * who get passed this value.
12173 		 */
12174 		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12175 		    req.r_req_id,
12176 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12177 		    path, dest_path, atoken);
12178 
12179 		zfree(ZV_NAMEI, dest_path);
12180 	} else if (vp_vtype == VDIR) {
12181 		char *tmpname = NULL;
12182 
12183 		/*
12184 		 * If the caller provided a lookup_name *and* a name length,
12185 		 * then we assume the lookup_name is not NUL-terminated.
12186 		 * Allocate a temporary buffer in this case to provide
12187 		 * a NUL-terminated path name to the IPC call.
12188 		 */
12189 		if (lookup_name != NULL && namelen != 0) {
12190 			if (namelen >= PATH_MAX) {
12191 				error = EINVAL;
12192 				goto out_req_remove;
12193 			}
12194 			tmpname = zalloc(ZV_NAMEI);
12195 			strlcpy(tmpname, lookup_name, namelen + 1);
12196 			lookup_name = tmpname;
12197 		} else if (lookup_name != NULL) {
12198 			/*
12199 			 * If the caller provided a lookup_name with a
12200 			 * zero name length, then we assume it's NUL-
12201 			 * terminated.  Verify it has a valid length.
12202 			 */
12203 			if (strlen(lookup_name) >= PATH_MAX) {
12204 				error = EINVAL;
12205 				goto out_req_remove;
12206 			}
12207 		}
12208 
12209 		/* (See above.) */
12210 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12211 		    req.r_req_id,
12212 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12213 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12214 
12215 		if (tmpname != NULL) {
12216 			zfree(ZV_NAMEI, tmpname);
12217 
12218 			/*
12219 			 * Poison lookup_name rather than reference
12220 			 * freed memory.
12221 			 */
12222 			lookup_name = NULL;
12223 		}
12224 	} else {
12225 		/* (See above.) */
12226 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12227 		    req.r_req_id,
12228 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12229 		    offset, size, path, atoken);
12230 	}
12231 	if (kern_ret != KERN_SUCCESS) {
12232 		/*
12233 		 * Also treat this like being unable to access the backing
12234 		 * store server.
12235 		 */
12236 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12237 		    kern_ret);
12238 		error = ETIMEDOUT;
12239 		goto out_req_remove;
12240 	}
12241 
12242 	/*
12243 	 * Give back the memory we allocated earlier while we wait; we
12244 	 * no longer need it.
12245 	 */
12246 	kfree_data(path, path_alloc_len);
12247 	path = NULL;
12248 
12249 	/*
12250 	 * Request has been submitted to the resolver. Now (interruptibly)
12251 	 * wait for completion. Upon requrn, the request will have been
12252 	 * removed from the lookup table.
12253 	 */
12254 	error = nspace_resolver_req_wait(&req);
12255 
12256 out_release_port:
12257 	if (path != NULL) {
12258 		kfree_data(path, path_alloc_len);
12259 		path = NULL;
12260 	}
12261 	ipc_port_release_send(mach_port);
12262 
12263 out_check_errors:
12264 	/*
12265 	 * The file resolver owns the logic about what error to return
12266 	 * to the caller.  We only need to handle a couple of special
12267 	 * cases here:
12268 	 */
12269 	if (error == EJUSTRETURN) {
12270 		/*
12271 		 * The requesting process is allowed to interact with
12272 		 * dataless objects.  Make a couple of sanity-checks
12273 		 * here to ensure the action makes sense.
12274 		 */
12275 		switch (op) {
12276 		case NAMESPACE_HANDLER_WRITE_OP:
12277 		case NAMESPACE_HANDLER_TRUNCATE_OP:
12278 		case NAMESPACE_HANDLER_RENAME_OP:
12279 			/*
12280 			 * This handles the case of the resolver itself
12281 			 * writing data to the file (or throwing it
12282 			 * away).
12283 			 */
12284 			error = 0;
12285 			break;
12286 		case NAMESPACE_HANDLER_READ_OP:
12287 		case NAMESPACE_HANDLER_LOOKUP_OP:
12288 			/*
12289 			 * This handles the case of the resolver needing
12290 			 * to look up inside of a dataless directory while
12291 			 * it's in the process of materializing it (for
12292 			 * example, creating files or directories).
12293 			 */
12294 			error = (vp_vtype == VDIR) ? 0 : EBADF;
12295 			break;
12296 		default:
12297 			error = EBADF;
12298 			break;
12299 		}
12300 	}
12301 
12302 	return error;
12303 
12304 out_req_remove:
12305 	nspace_resolver_req_remove(&req);
12306 	goto out_release_port;
12307 #else
12308 	return ENOTSUP;
12309 #endif /* CONFIG_DATALESS_FILES */
12310 }
12311 
12312 /*
12313  * vfs_materialize_file: Materialize a regular file.
12314  *
12315  * Inputs:
12316  * vp		The dataless file to be materialized.
12317  *
12318  * op		What kind of operation is being performed:
12319  *		-> NAMESPACE_HANDLER_READ_OP
12320  *		-> NAMESPACE_HANDLER_WRITE_OP
12321  *		-> NAMESPACE_HANDLER_LINK_CREATE
12322  *		-> NAMESPACE_HANDLER_DELETE_OP
12323  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12324  *		-> NAMESPACE_HANDLER_RENAME_OP
12325  *
12326  * offset	offset of I/O for READ or WRITE.  Ignored for
12327  *		other ops.
12328  *
12329  * size		size of I/O for READ or WRITE  Ignored for
12330  *		other ops.
12331  *
12332  * If offset or size are -1 for a READ or WRITE, then the resolver should
12333  * consider the range to be unknown.
12334  *
12335  * Upon successful return, the caller may proceed with the operation.
12336  * N.B. the file may still be "dataless" in this case.
12337  */
12338 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12339 vfs_materialize_file(
12340 	struct vnode *vp,
12341 	uint64_t op,
12342 	int64_t offset,
12343 	int64_t size)
12344 {
12345 	if (vp->v_type != VREG) {
12346 		return EFTYPE;
12347 	}
12348 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12349 	           NULL);
12350 }
12351 
12352 /*
12353  * vfs_materialize_dir:
12354  *
12355  * Inputs:
12356  * vp		The dataless directory to be materialized.
12357  *
12358  * op		What kind of operation is being performed:
12359  *		-> NAMESPACE_HANDLER_READ_OP
12360  *		-> NAMESPACE_HANDLER_WRITE_OP
12361  *		-> NAMESPACE_HANDLER_DELETE_OP
12362  *		-> NAMESPACE_HANDLER_RENAME_OP
12363  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12364  *
12365  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12366  *		other ops.  May or may not be NUL-terminated; see below.
12367  *
12368  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12369  *		terminated and namelen is the number of valid bytes in
12370  *		lookup_name. If zero, then lookup_name is assumed to be
12371  *		NUL-terminated.
12372  *
12373  * Upon successful return, the caller may proceed with the operation.
12374  * N.B. the directory may still be "dataless" in this case.
12375  */
12376 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12377 vfs_materialize_dir(
12378 	struct vnode *vp,
12379 	uint64_t op,
12380 	char *lookup_name,
12381 	size_t namelen)
12382 {
12383 	if (vp->v_type != VDIR) {
12384 		return EFTYPE;
12385 	}
12386 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12387 		return EINVAL;
12388 	}
12389 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12390 	           namelen, NULL);
12391 }
12392 
12393 /*
12394  * vfs_materialize_reparent:
12395  *
12396  * Inputs:
12397  * vp		The dataless file or directory to be materialized.
12398  *
12399  * tdvp		The new parent directory for the dataless file.
12400  *
12401  * Upon successful return, the caller may proceed with the operation.
12402  * N.B. the item may still be "dataless" in this case.
12403  */
12404 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12405 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12406 {
12407 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12408 		return EFTYPE;
12409 	}
12410 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12411 	           0, 0, NULL, 0, tdvp);
12412 }
12413 
#if 0
/*
 * Currently-unused helper (kept under #if 0): render a volfs-style
 * "/.vol/<fsid>/<fileid>" path for a vnode.  On entry *len is the
 * capacity of "path"; on return it holds the rendered length plus one
 * for the NUL.  Returns 0 on success, -1 if vnode_getattr() failed.
 *
 * NOTE(review): if this is ever revived, beware that snprintf returns
 * the would-be length on truncation, so *len can exceed the buffer
 * capacity — verify callers handle that before use.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12436 
12437 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12438 fsctl_bogus_command_compat(unsigned long cmd)
12439 {
12440 	switch (cmd) {
12441 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12442 		return FSIOC_SYNC_VOLUME;
12443 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12444 		return FSIOC_ROUTEFS_SETROUTEID;
12445 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12446 		return FSIOC_SET_PACKAGE_EXTS;
12447 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12448 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12449 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12450 		return DISK_CONDITIONER_IOC_GET;
12451 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12452 		return DISK_CONDITIONER_IOC_SET;
12453 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12454 		return FSIOC_FIOSEEKHOLE;
12455 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12456 		return FSIOC_FIOSEEKDATA;
12457 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12458 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12459 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12460 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12461 	}
12462 
12463 	return cmd;
12464 }
12465 
/*
 * chflags0() setattr callback used by handle_flags(): forwards the
 * compare-and-swap request ("arg" is the struct fsioc_cas_bsdflags
 * passed through from the caller) to the filesystem via the
 * FSIOC_CAS_BSDFLAGS ioctl.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12471 
/*
 * FSIOC_SYNC_VOLUME handler: sync the volume containing vp.
 *
 * "data" points to the user's uint32_t flag word (FSCTL_SYNC_WAIT /
 * FSCTL_SYNC_FULLSYNC).  This routine drops the caller's iocount on vp
 * (keeping only a holdcount while it works) and sets *arg_vp = NULL so
 * the caller knows not to vnode_put() it again.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Swap the iocount for a holdcount while we sync. */
	vnode_hold(vp);
	vnode_put(vp);

	/* Translate the user's wait flag into the VFS_SYNC waitfor argument. */
	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple volumes sharing space in a
	 * partition (e.g. APFS volumes in a container), it knows that the
	 * waitfor argument to VFS_SYNC is a set of flags, so add MNT_VOLUME
	 * to restrict the sync to this volume.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* flag word ("arg"), not the
	 * user-supplied flag word (*(uint32_t *)data), against
	 * FSCTL_SYNC_FULLSYNC — confirm this overlap is intentional.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12536 
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID handler: copy the route path in from user
 * space and mount routefs there.  Superuser only.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t pathlen = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}

	/* Zero-fill first so the buffer is always NUL-terminated. */
	bzero(routepath, sizeof(routepath));
	error = copyinstr(udata, routepath, sizeof(routepath), &pathlen);
	if (error != 0) {
		return error;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12557 
12558 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12559 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12560 {
12561 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12562 	struct vnode_attr va;
12563 	int error;
12564 
12565 	VATTR_INIT(&va);
12566 	VATTR_SET(&va, va_flags, cas->new_flags);
12567 
12568 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12569 
12570 #if CONFIG_FSE
12571 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12572 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12573 	}
12574 #endif
12575 
12576 	return error;
12577 }
12578 
12579 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12580 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12581 {
12582 	struct mount *mp = NULL;
12583 	errno_t rootauth = 0;
12584 
12585 	mp = vp->v_mount;
12586 
12587 	/*
12588 	 * query the underlying FS and see if it reports something
12589 	 * sane for this vnode. If volume is authenticated via
12590 	 * chunklist, leave that for the caller to determine.
12591 	 */
12592 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12593 
12594 	return rootauth;
12595 }
12596 
12597 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12598 	"com.apple.private.kernel.set-package-extensions"
12599 
12600 /*
12601  * Make a filesystem-specific control call:
12602  */
12603 /* ARGSUSED */
/*
 * Common implementation for fsctl() and ffsctl().
 *
 * "cmd" is an ioctl-style selector with the argument size and transfer
 * direction encoded in it.  The argument is staged in a small stack
 * buffer (or a heap buffer for large sizes), generic FSIOC_* selectors
 * are handled here, and everything else is passed to the filesystem
 * via VNOP_IOCTL().
 *
 * On return *arg_vp may have been set to NULL (FSIOC_SYNC_VOLUME drops
 * the iocount itself); callers must re-check it before vnode_put().
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not for devices; those go through the device ioctl path. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy size-stripped selectors back to their full form. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments get a heap buffer; small ones use the stack. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Input argument: copy it in from user space. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size input: the "argument" is udata itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		/* IOC_VOID also passes udata by value. */
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May drop vp's iocount and NULL out *arg_vp — see handler. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		/* Without ROUTEFS this selector is accepted as a no-op. */
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* Read-only "mtmfs" overrides get special security flags. */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear an existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/*
		 * Report EBUSY if anyone else has this vnode open.  A quick
		 * unlocked usecount check avoids taking the lock in the
		 * common single-open case; named streams get a deeper check.
		 */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		/* base_dirs == NULL is a size query: only count is returned. */
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		if (get_base_dirs->base_dirs) {
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		/*
		 * NOTE(review): the callee updates get_base_dirs->count, and
		 * the copyout/kfree_type below use the updated value rather
		 * than the count the buffer was allocated with — verify the
		 * callee can only reduce the count.
		 */
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			/* Rejected selectors skip the IOC_OUT copyout below. */
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12907 
12908 /* ARGSUSED */
/*
 * fsctl() system call: path-based filesystem control operation.
 * Looks up the path, runs the MACF mount check, then dispatches
 * through fsctl_internal().
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* Firmlink control must operate on the firmlink itself. */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	/* fsctl_internal() may drop the iocount and set vp to NULL. */
	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
12960 /* ARGSUSED */
/*
 * ffsctl() system call: fd-based variant of fsctl().  Resolves the fd
 * to a vnode, takes an iocount, runs the MACF mount check, then
 * dispatches through fsctl_internal().
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on:  */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* file_vnode() took a file reference; remember the fd to drop it. */
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
13002 /* end of fsctl system call */
13003 
13004 #define FILESEC_ACCESS_ENTITLEMENT              \
13005 	"com.apple.private.vfs.filesec-access"
13006 
13007 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13008 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13009 {
13010 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13011 		/*
13012 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13013 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13014 		 */
13015 		if ((!setting && vfs_context_issuser(ctx)) ||
13016 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13017 		    FILESEC_ACCESS_ENTITLEMENT)) {
13018 			return 0;
13019 		}
13020 	}
13021 
13022 	return EPERM;
13023 }
13024 
13025 /*
13026  *  Retrieve the data of an extended attribute.
13027  */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These options are kernel-internal and rejected from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attrs require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binaray compatibilty in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implemtation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying fileystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Cap the request; filesystems wire-allocate the full size. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With auio == NULL, vn_getxattr() returns just the size in attrsize. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* Data read: bytes transferred.  Size query: the attribute's size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13110 
13111 /*
13112  * Retrieve the data of an extended attribute.
13113  */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/*
	 * No path lookup happens here, so XATTR_NOFOLLOW is meaningless
	 * and rejected along with the kernel-internal options.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attrs require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	if (uap->value && uap->size > 0) {
		/* Cap the request; filesystems wire-allocate the full size. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	/* With auio == NULL, vn_getxattr() returns just the size in attrsize. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	/* Data read: bytes transferred.  Size query: the attribute's size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13168 
/*
 * Heap-allocated working state for setxattr() (allocated with
 * kalloc_type there; these pieces together are large for the stack).
 * Note: the original comment said "checkdirs iteration", which appears
 * to be a copy-paste leftover — the only user is setxattr().
 */
struct setxattr_ctx {
	struct nameidata nd;                    /* path lookup state */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attribute name */
	UIO_STACKBUF(uio_buf, 1);               /* backing storage for the value uio */
};
13175 
13176 /*
13177  * Set the data of an extended attribute.
13178  */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* These options are kernel-internal and rejected from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Working state is heap-allocated; see struct setxattr_ctx. */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected attrs may only be set with the filesec entitlement. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent too, so its directory lease can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13255 
/*
 * Set the data of an extended attribute on the file referenced by a
 * file descriptor (fd-based variant of setxattr()).
 */
13259 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13260 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13261 {
13262 	vnode_t vp;
13263 	char attrname[XATTR_MAXNAMELEN + 1];
13264 	vfs_context_t ctx = vfs_context_current();
13265 	uio_t auio = NULL;
13266 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13267 	size_t namelen;
13268 	int error;
13269 	UIO_STACKBUF(uio_buf, 1);
13270 
13271 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13272 		return EINVAL;
13273 	}
13274 
13275 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13276 	if (error != 0) {
13277 		if (error == EPERM) {
13278 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13279 			return ENAMETOOLONG;
13280 		}
13281 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13282 		return error;
13283 	}
13284 	if (xattr_protected(attrname) &&
13285 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13286 		return error;
13287 	}
13288 	if (uap->size != 0 && uap->value == 0) {
13289 		return EINVAL;
13290 	}
13291 	if (uap->size > INT_MAX) {
13292 		return E2BIG;
13293 	}
13294 	if ((error = file_vnode(uap->fd, &vp))) {
13295 		return error;
13296 	}
13297 	if ((error = vnode_getwithref(vp))) {
13298 		file_drop(uap->fd);
13299 		return error;
13300 	}
13301 
13302 #if CONFIG_FILE_LEASES
13303 	vnode_breakdirlease(vp, true, O_WRONLY);
13304 #endif
13305 
13306 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13307 	    &uio_buf[0], sizeof(uio_buf));
13308 	uio_addiov(auio, uap->value, uap->size);
13309 
13310 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13311 #if CONFIG_FSE
13312 	if (error == 0) {
13313 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13314 		    FSE_ARG_VNODE, vp,
13315 		    FSE_ARG_DONE);
13316 	}
13317 #endif
13318 	vnode_put(vp);
13319 	file_drop(uap->fd);
13320 	*retval = 0;
13321 	return error;
13322 }
13323 
13324 /*
13325  * Remove an extended attribute.
13326  * XXX Code duplication here.
13327  */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* These option bits are reserved for kernel-internal callers. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* Protected (system-owned) attributes may never be removed from userspace. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also resolve the parent so any directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd listeners only on success. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13378 
13379 /*
13380  * Remove an extended attribute.
13381  * XXX Code duplication here.
13382  */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* Context only needed here for the fsevent notification below. */
	vfs_context_t ctx = vfs_context_current();
#endif

	/*
	 * XATTR_NOFOLLOW is meaningless for an fd-based call (no path lookup);
	 * the other two bits are kernel-internal only.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* Protected (system-owned) attributes may never be removed from userspace. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount; drop the fd reference if that fails. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	/* Notify fseventsd listeners only on success. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13430 
13431 /*
13432  * Retrieve the list of extended attribute names.
13433  * XXX Code duplication here.
13434  */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These option bits are reserved for kernel-internal callers. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/* A NULL/zero-sized buffer means "report required size only" via attrsize. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Number of bytes actually copied to the user buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Size query: total space needed for the full name list. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13475 
13476 /*
13477  * Retrieve the list of extended attribute names.
13478  * XXX Code duplication here.
13479  */
int
flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/*
	 * XATTR_NOFOLLOW is meaningless for an fd-based call (no path lookup);
	 * the other two bits are kernel-internal only.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount; drop the fd reference if that fails. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	/* A NULL/zero-sized buffer means "report required size only" via attrsize. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype,
		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());

	vnode_put(vp);
	file_drop(uap->fd);
	if (auio) {
		/* Number of bytes actually copied to the user buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Size query: total space needed for the full name list. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13518 
13519 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13520 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13521     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13522 {
13523 	int error;
13524 	struct mount *mp = NULL;
13525 	vnode_t vp;
13526 	int length;
13527 	int bpflags;
13528 	/* maximum number of times to retry build_path */
13529 	unsigned int retries = 0x10;
13530 
13531 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13532 		return EINVAL;
13533 	}
13534 
13535 	if (buf == NULL) {
13536 		return ENOMEM;
13537 	}
13538 
13539 retry:
13540 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13541 		error = ENOTSUP;  /* unexpected failure */
13542 		return ENOTSUP;
13543 	}
13544 
13545 #if CONFIG_UNION_MOUNTS
13546 unionget:
13547 #endif /* CONFIG_UNION_MOUNTS */
13548 	if (objid == 2) {
13549 		struct vfs_attr vfsattr;
13550 		int use_vfs_root = TRUE;
13551 
13552 		VFSATTR_INIT(&vfsattr);
13553 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13554 		if (!(options & FSOPT_ISREALFSID) &&
13555 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13556 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13557 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13558 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13559 				use_vfs_root = FALSE;
13560 			}
13561 		}
13562 
13563 		if (use_vfs_root) {
13564 			error = VFS_ROOT(mp, &vp, ctx);
13565 		} else {
13566 			error = VFS_VGET(mp, objid, &vp, ctx);
13567 		}
13568 	} else {
13569 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13570 	}
13571 
13572 #if CONFIG_UNION_MOUNTS
13573 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13574 		/*
13575 		 * If the fileid isn't found and we're in a union
13576 		 * mount volume, then see if the fileid is in the
13577 		 * mounted-on volume.
13578 		 */
13579 		struct mount *tmp = mp;
13580 		mp = vnode_mount(tmp->mnt_vnodecovered);
13581 		vfs_unbusy(tmp);
13582 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13583 			goto unionget;
13584 		}
13585 	} else {
13586 		vfs_unbusy(mp);
13587 	}
13588 #else
13589 	vfs_unbusy(mp);
13590 #endif /* CONFIG_UNION_MOUNTS */
13591 
13592 	if (error) {
13593 		return error;
13594 	}
13595 
13596 #if CONFIG_MACF
13597 	error = mac_vnode_check_fsgetpath(ctx, vp);
13598 	if (error) {
13599 		vnode_put(vp);
13600 		return error;
13601 	}
13602 #endif
13603 
13604 	/* Obtain the absolute path to this vnode. */
13605 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13606 	if (options & FSOPT_NOFIRMLINKPATH) {
13607 		bpflags |= BUILDPATH_NO_FIRMLINK;
13608 	}
13609 	bpflags |= BUILDPATH_CHECK_MOVED;
13610 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13611 	vnode_put(vp);
13612 
13613 	if (error) {
13614 		/* there was a race building the path, try a few more times */
13615 		if (error == EAGAIN) {
13616 			--retries;
13617 			if (retries > 0) {
13618 				goto retry;
13619 			}
13620 
13621 			error = ENOENT;
13622 		}
13623 		goto out;
13624 	}
13625 
13626 	AUDIT_ARG(text, buf);
13627 
13628 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13629 		unsigned long path_words[NUMPARMS];
13630 		size_t path_len = sizeof(path_words);
13631 
13632 		if ((size_t)length < path_len) {
13633 			memcpy((char *)path_words, buf, length);
13634 			memset((char *)path_words + length, 0, path_len - length);
13635 
13636 			path_len = length;
13637 		} else {
13638 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13639 		}
13640 
13641 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13642 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13643 	}
13644 
13645 	*pathlen = length; /* may be superseded by error */
13646 
13647 out:
13648 	return error;
13649 }
13650 
13651 /*
13652  * Obtain the full pathname of a file system object by id.
13653  */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only the publicly supported option bits are accepted. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	/* fsid is passed by reference from userspace. */
	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
		return EINVAL;
	}
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	/* Resolve (fsid, objid) to an absolute path in the kernel buffer. */
	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
13697 
13698 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13699 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13700 {
13701 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13702 	           0, retval);
13703 }
13704 
13705 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13706 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13707 {
13708 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13709 	           uap->options, retval);
13710 }
13711 
13712 /*
13713  * Common routine to handle various flavors of statfs data heading out
13714  *	to user space.
13715  *
13716  * Returns:	0			Success
13717  *		EFAULT
13718  */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		/* 64-bit userspace layout: fields copy through without scaling. */
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Caller asked for the truncated variant without the reserved tail. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		/* 32-bit userspace layout: block counts may need scaling to fit. */
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Caller asked for the truncated variant without the reserved tail. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* Report the full (untruncated) structure size to the caller. */
		*sizep = my_size;
	}
	return error;
}
13832 
13833 /*
13834  * copy stat structure into user_stat structure.
13835  */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero first so struct padding never leaks kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style time fields */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	/* split time_t + nanoseconds field layout */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13872 
/*
 * Copy stat structure into a 32-bit user_stat structure; time fields are
 * narrowed (truncating casts) to the 32-bit ABI widths.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so struct padding never leaks kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style time fields, narrowed to 32-bit types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* split time_t + nanoseconds field layout */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13909 
13910 /*
13911  * copy stat64 structure into user_stat64 structure.
13912  */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero first so struct padding never leaks kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style time fields (stat64 adds birthtime) */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	/* split time_t + nanoseconds field layout */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13953 
/*
 * Copy stat64 structure into a 32-bit user_stat64 structure; time fields
 * are narrowed (truncating casts) to the 32-bit ABI widths.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so struct padding never leaks kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style time fields, narrowed to 32-bit types (stat64 adds birthtime) */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* split time_t + nanoseconds field layout */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13994 
13995 /*
13996  * Purge buffer cache for simulating cold starts
13997  */
/* Per-vnode iterator body: push dirty pages and invalidate the whole file's cache. */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
14005 
/* Per-mount iterator body: purge the UBC for every vnode on the mount. */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
14013 
/* Boot-arg / sysctl (vfs.purge_vm_pagers): when set, vfs_purge() also purges file-backed VM pagers. */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14016 
/*
 * vfs_purge() syscall: drop cached file data system-wide, simulating a cold
 * start.  Restricted to the superuser since it is highly disruptive.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	/* Walk every mount, pushing and invalidating cached pages for all vnodes. */
	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
14033 
14034 /*
14035  * gets the vnode associated with the (unnamed) snapshot directory
14036  * for a Filesystem. The snapshot directory vnode is returned with
14037  * an iocount on it.
14038  */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate to the filesystem; on success *sdvpp is returned with an iocount. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
14044 
14045 /*
14046  * Get the snapshot vnode.
14047  *
14048  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14049  * needs nameidone() on ndp.
14050  *
14051  * If the snapshot vnode exists it is returned in ndp->ni_vp.
14052  *
14053  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14054  * not needed.
14055  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-NULL the outputs so the error path can release unconditionally. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* dirfd must reference the root of a volume. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for an embedded '/'; snapshot names must be single path components. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	if (error) {
		/* On failure, release both iocounts so callers need no cleanup. */
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14158 
14159 /*
14160  * create a filesystem snapshot (for supporting filesystems)
14161  *
14162  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14163  * We get to the (unnamed) snapshot directory vnode and create the vnode
14164  * for the snapshot in it.
14165  *
14166  * Restrictions:
14167  *
14168  *    a) Passed in name for snapshot cannot have slashes.
14169  *    b) name can't be "." or ".."
14170  *
14171  * Since this requires superuser privileges, vnode_authorize calls are not
14172  * made.
14173  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate it */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Resolve the mount root (rvp) and unnamed snapshot directory
	 * (snapdvp) for dirfd, and run a CREATE-op namei for 'name' inside
	 * that directory.  On success we hold iocounts on both vnodes and
	 * ndp is live (nameidone() still required below).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* the lookup found an existing snapshot with that name */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* snapshots are created as regular files with no access bits */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/*
		 * Caller privilege was already checked by the dispatcher, so
		 * creation skips authorization and attribute inheritance.
		 */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			/* drop the iocount vn_create returned on the new vnode */
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14220 
14221 /*
14222  * Delete a Filesystem snapshot
14223  *
14224  * get the vnode for the unnamed snapshot directory and the snapshot and
14225  * delete the snapshot.
14226  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate it */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Look the snapshot up by name under the unnamed snapshot directory;
	 * on success we hold iocounts on rvp, snapdvp and ndp->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Unlink the snapshot; suppress namespace events for this internal op */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14255 
14256 /*
14257  * Revert a filesystem to a snapshot
14258  *
14259  * Marks the filesystem to revert to the given snapshot on next mount.
14260  */
14261 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14262 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14263     vfs_context_t ctx)
14264 {
14265 	int error;
14266 	vnode_t rvp;
14267 	mount_t mp;
14268 	struct fs_snapshot_revert_args revert_data;
14269 	struct componentname cnp;
14270 	caddr_t name_buf;
14271 	size_t name_len;
14272 
14273 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14274 	if (error) {
14275 		return error;
14276 	}
14277 	mp = vnode_mount(rvp);
14278 
14279 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14280 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14281 	if (error) {
14282 		zfree(ZV_NAMEI, name_buf);
14283 		vnode_put(rvp);
14284 		return error;
14285 	}
14286 
14287 #if CONFIG_MACF
14288 	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14289 	if (error) {
14290 		zfree(ZV_NAMEI, name_buf);
14291 		vnode_put(rvp);
14292 		return error;
14293 	}
14294 #endif
14295 
14296 	/*
14297 	 * Grab mount_iterref so that we can release the vnode,
14298 	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14299 	 */
14300 	error = mount_iterref(mp, 0);
14301 	vnode_put(rvp);
14302 	if (error) {
14303 		zfree(ZV_NAMEI, name_buf);
14304 		return error;
14305 	}
14306 
14307 	memset(&cnp, 0, sizeof(cnp));
14308 	cnp.cn_pnbuf = (char *)name_buf;
14309 	cnp.cn_nameiop = LOOKUP;
14310 	cnp.cn_flags = ISLASTCN | HASBUF;
14311 	cnp.cn_pnlen = MAXPATHLEN;
14312 	cnp.cn_nameptr = cnp.cn_pnbuf;
14313 	cnp.cn_namelen = (int)name_len;
14314 	revert_data.sr_cnp = &cnp;
14315 
14316 	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14317 	mount_iterdrop(mp);
14318 	zfree(ZV_NAMEI, name_buf);
14319 
14320 	if (error) {
14321 		/* If there was any error, try again using VNOP_IOCTL */
14322 
14323 		vnode_t snapdvp;
14324 		struct nameidata namend;
14325 
14326 		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14327 		    OP_LOOKUP, ctx);
14328 		if (error) {
14329 			return error;
14330 		}
14331 
14332 
14333 		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14334 		    0, ctx);
14335 
14336 		vnode_put(namend.ni_vp);
14337 		nameidone(&namend);
14338 		vnode_put(snapdvp);
14339 		vnode_put(rvp);
14340 	}
14341 
14342 	return error;
14343 }
14344 
14345 /*
14346  * rename a Filesystem snapshot
14347  *
14348  * get the vnode for the unnamed snapshot directory and the snapshot and
14349  * rename the snapshot. This is a very specialised (and simple) case of
14350  * rename(2) (which has to deal with a lot more complications). It differs
14351  * slightly from rename(2) in that EEXIST is returned if the new name exists.
14352  */
14353 static int __attribute__((noinline))
snapshot_rename(int dirfd,user_addr_t old,user_addr_t new,__unused uint32_t flags,vfs_context_t ctx)14354 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
14355     __unused uint32_t flags, vfs_context_t ctx)
14356 {
14357 	vnode_t rvp, snapdvp;
14358 	int error, i;
14359 	caddr_t newname_buf;
14360 	size_t name_len;
14361 	vnode_t fvp;
14362 	struct nameidata *fromnd, *tond;
14363 	/* carving out a chunk for structs that are too big to be on stack. */
14364 	struct {
14365 		struct nameidata from_node;
14366 		struct nameidata to_node;
14367 	} * __rename_data;
14368 
14369 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
14370 	fromnd = &__rename_data->from_node;
14371 	tond = &__rename_data->to_node;
14372 
14373 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
14374 	    OP_UNLINK, ctx);
14375 	if (error) {
14376 		goto out;
14377 	}
14378 	fvp  = fromnd->ni_vp;
14379 
14380 	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14381 	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
14382 	if (error) {
14383 		goto out1;
14384 	}
14385 
14386 	/*
14387 	 * Some sanity checks- new name can't be empty, "." or ".." or have
14388 	 * slashes.
14389 	 * (the length returned by copyinstr includes the terminating NUL)
14390 	 *
14391 	 * The FS rename VNOP is suppossed to handle this but we'll pick it
14392 	 * off here itself.
14393 	 */
14394 	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
14395 	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
14396 		error = EINVAL;
14397 		goto out1;
14398 	}
14399 	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
14400 		;
14401 	}
14402 	if (i < (int)name_len) {
14403 		error = EINVAL;
14404 		goto out1;
14405 	}
14406 
14407 #if CONFIG_MACF
14408 	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
14409 	    newname_buf);
14410 	if (error) {
14411 		goto out1;
14412 	}
14413 #endif
14414 
14415 	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
14416 	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
14417 	tond->ni_dvp = snapdvp;
14418 
14419 	error = namei(tond);
14420 	if (error) {
14421 		goto out2;
14422 	} else if (tond->ni_vp) {
14423 		/*
14424 		 * snapshot rename behaves differently than rename(2) - if the
14425 		 * new name exists, EEXIST is returned.
14426 		 */
14427 		vnode_put(tond->ni_vp);
14428 		error = EEXIST;
14429 		goto out2;
14430 	}
14431 
14432 	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
14433 	    &tond->ni_cnd, ctx);
14434 
14435 out2:
14436 	nameidone(tond);
14437 out1:
14438 	zfree(ZV_NAMEI, newname_buf);
14439 	vnode_put(fvp);
14440 	vnode_put(snapdvp);
14441 	vnode_put(rvp);
14442 	nameidone(fromnd);
14443 out:
14444 	kfree_type(typeof(*__rename_data), __rename_data);
14445 	return error;
14446 }
14447 
14448 /*
14449  * Mount a Filesystem snapshot
14450  *
14451  * get the vnode for the unnamed snapshot directory and the snapshot and
14452  * mount the snapshot.
14453  */
14454 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14455 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14456     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14457 {
14458 	mount_t mp;
14459 	vnode_t rvp, snapdvp, snapvp, vp, pvp;
14460 	struct fs_snapshot_mount_args smnt_data;
14461 	int error;
14462 	struct nameidata *snapndp, *dirndp;
14463 	/* carving out a chunk for structs that are too big to be on stack. */
14464 	struct {
14465 		struct nameidata snapnd;
14466 		struct nameidata dirnd;
14467 	} * __snapshot_mount_data;
14468 
14469 	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14470 	snapndp = &__snapshot_mount_data->snapnd;
14471 	dirndp = &__snapshot_mount_data->dirnd;
14472 
14473 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14474 	    OP_LOOKUP, ctx);
14475 	if (error) {
14476 		goto out;
14477 	}
14478 
14479 	snapvp  = snapndp->ni_vp;
14480 	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14481 		error = EIO;
14482 		goto out1;
14483 	}
14484 
14485 	/* Get the vnode to be covered */
14486 	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14487 	    UIO_USERSPACE, directory, ctx);
14488 	error = namei(dirndp);
14489 	if (error) {
14490 		goto out1;
14491 	}
14492 
14493 	vp = dirndp->ni_vp;
14494 	pvp = dirndp->ni_dvp;
14495 	mp = vnode_mount(rvp);
14496 
14497 	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14498 		error = EINVAL;
14499 		goto out2;
14500 	}
14501 
14502 #if CONFIG_MACF
14503 	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14504 	    mp->mnt_vfsstat.f_fstypename);
14505 	if (error) {
14506 		goto out2;
14507 	}
14508 #endif
14509 
14510 	smnt_data.sm_mp  = mp;
14511 	smnt_data.sm_cnp = &snapndp->ni_cnd;
14512 	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14513 	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
14514 	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14515 
14516 out2:
14517 	vnode_put(vp);
14518 	vnode_put(pvp);
14519 	nameidone(dirndp);
14520 out1:
14521 	vnode_put(snapvp);
14522 	vnode_put(snapdvp);
14523 	vnode_put(rvp);
14524 	nameidone(snapndp);
14525 out:
14526 	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14527 	return error;
14528 }
14529 
14530 /*
14531  * Root from a snapshot of the filesystem
14532  *
14533  * Marks the filesystem to root from the given snapshot on next boot.
14534  */
14535 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14536 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14537     vfs_context_t ctx)
14538 {
14539 	int error;
14540 	vnode_t rvp;
14541 	mount_t mp;
14542 	struct fs_snapshot_root_args root_data;
14543 	struct componentname cnp;
14544 	caddr_t name_buf;
14545 	size_t name_len;
14546 
14547 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14548 	if (error) {
14549 		return error;
14550 	}
14551 	mp = vnode_mount(rvp);
14552 
14553 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14554 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14555 	if (error) {
14556 		zfree(ZV_NAMEI, name_buf);
14557 		vnode_put(rvp);
14558 		return error;
14559 	}
14560 
14561 	// XXX MAC checks ?
14562 
14563 	/*
14564 	 * Grab mount_iterref so that we can release the vnode,
14565 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14566 	 */
14567 	error = mount_iterref(mp, 0);
14568 	vnode_put(rvp);
14569 	if (error) {
14570 		zfree(ZV_NAMEI, name_buf);
14571 		return error;
14572 	}
14573 
14574 	memset(&cnp, 0, sizeof(cnp));
14575 	cnp.cn_pnbuf = (char *)name_buf;
14576 	cnp.cn_nameiop = LOOKUP;
14577 	cnp.cn_flags = ISLASTCN | HASBUF;
14578 	cnp.cn_pnlen = MAXPATHLEN;
14579 	cnp.cn_nameptr = cnp.cn_pnbuf;
14580 	cnp.cn_namelen = (int)name_len;
14581 	root_data.sr_cnp = &cnp;
14582 
14583 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14584 
14585 	mount_iterdrop(mp);
14586 	zfree(ZV_NAMEI, name_buf);
14587 
14588 	return error;
14589 }
14590 
14591 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14592 vfs_context_can_snapshot(vfs_context_t ctx)
14593 {
14594 	static const char * const snapshot_entitlements[] = {
14595 		"com.apple.private.vfs.snapshot",
14596 		"com.apple.developer.vfs.snapshot",
14597 		"com.apple.private.apfs.arv.limited.snapshot",
14598 	};
14599 	static const size_t nentitlements =
14600 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14601 	size_t i;
14602 
14603 	task_t task = vfs_context_task(ctx);
14604 	for (i = 0; i < nentitlements; i++) {
14605 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14606 			return TRUE;
14607 		}
14608 	}
14609 	return FALSE;
14610 }
14611 
14612 /*
14613  * FS snapshot operations dispatcher
14614  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* every snapshot operation requires a snapshot entitlement */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/* no backing device vnode; resolve the mount-from path instead */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permit the operation if the caller is superuser, OR may
		 * write the backing device, OR holds the user-snapshot
		 * entitlement; otherwise fail with EPERM.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* dispatch to the per-operation helper */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14703