xref: /xnu-10063.101.15/bsd/vfs/vfs_syscalls.c (revision 94d3b452840153a99b38a3a9659680b2a006908e)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117 
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120 
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125 
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 
129 #include <libkern/OSAtomic.h>
130 #include <os/atomic_private.h>
131 #include <pexpert/pexpert.h>
132 #include <IOKit/IOBSD.h>
133 
134 // deps for MIG call
135 #include <kern/host.h>
136 #include <kern/ipc_misc.h>
137 #include <mach/host_priv.h>
138 #include <mach/vfs_nspace.h>
139 #include <os/log.h>
140 
141 #include <nfs/nfs_conf.h>
142 
143 #if ROUTEFS
144 #include <miscfs/routefs/routefs.h>
145 #endif /* ROUTEFS */
146 
147 #if CONFIG_MACF
148 #include <security/mac.h>
149 #include <security/mac_framework.h>
150 #endif
151 
152 #if CONFIG_FSE
153 #define GET_PATH(x) \
154 	((x) = get_pathbuff())
155 #define RELEASE_PATH(x) \
156 	release_pathbuff(x)
157 #else
158 #define GET_PATH(x)     \
159 	((x) = zalloc(ZV_NAMEI))
160 #define RELEASE_PATH(x) \
161 	zfree(ZV_NAMEI, x)
162 #endif /* CONFIG_FSE */
163 
164 #ifndef HFS_GET_BOOT_INFO
165 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
166 #endif
167 
168 #ifndef HFS_SET_BOOT_INFO
169 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
170 #endif
171 
172 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
173 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
174 #endif
175 
176 extern void disk_conditioner_unmount(mount_t mp);
177 
178 /* struct for checkdirs iteration */
179 struct cdirargs {
180 	vnode_t olddp;
181 	vnode_t newdp;
182 };
183 /* callback  for checkdirs iteration */
184 static int checkdirs_callback(proc_t p, void * arg);
185 
186 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
187 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
188 void enablequotas(struct mount *mp, vfs_context_t ctx);
189 static int getfsstat_callback(mount_t mp, void * arg);
190 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
191 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
192 static int sync_callback(mount_t, void *);
193 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
194     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
195     boolean_t partial_copy);
196 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
197 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
198     struct componentname *cnp, user_addr_t fsmountargs,
199     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
200 void vfs_notify_mount(vnode_t pdvp);
201 
202 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
203 
204 struct fd_vn_data * fg_vn_data_alloc(void);
205 
206 /*
207  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
208  * Concurrent lookups (or lookups by ids) on hard links can cause the
209  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
210  * does) to return ENOENT as the path cannot be returned from the name cache
211  * alone. We have no option but to retry and hope to get one namei->reverse path
212  * generation done without an intervening lookup, lookup by id on the hard link
213  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
214  * which currently are the MAC hooks for rename, unlink and rmdir.
215  */
216 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
217 
218 /* Max retry limit for rename due to vnode recycling. */
219 #define MAX_RENAME_ERECYCLE_RETRIES 1024
220 
221 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
222     int unlink_flags);
223 
224 #ifdef CONFIG_IMGSRC_ACCESS
225 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
226 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
227 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
228 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
229 static void mount_end_update(mount_t mp);
230 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
231 #endif /* CONFIG_IMGSRC_ACCESS */
232 
233 //snapshot functions
234 #if CONFIG_MNT_ROOTSNAP
235 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
236 #else
237 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
238 #endif
239 
240 __private_extern__
241 int sync_internal(void);
242 
243 __private_extern__
244 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
245 
246 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
247 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
248 
249 /* vars for sync mutex */
250 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
251 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
252 
253 extern lck_rw_t rootvnode_rw_lock;
254 
255 VFS_SMR_DECLARE;
256 extern uint32_t nc_smr_enabled;
257 
258 /*
259  * incremented each time a mount or unmount operation occurs
260  * used to invalidate the cached value of the rootvp in the
261  * mount structure utilized by cache_lookup_path
262  */
263 uint32_t mount_generation = 0;
264 
265 /* counts number of mount and unmount operations */
266 unsigned int vfs_nummntops = 0;
267 
268 /* system-wide, per-boot unique mount ID */
269 static _Atomic uint64_t mount_unique_id = 1;
270 
271 extern const struct fileops vnops;
272 #if CONFIG_APPLEDOUBLE
273 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
274 #endif /* CONFIG_APPLEDOUBLE */
275 
276 /* Maximum buffer length supported by fsgetpath(2) */
277 #define FSGETPATH_MAXBUFLEN  8192
278 
279 /*
280  * Virtual File System System Calls
281  */
282 
283 /*
284  * Private in-kernel mounting spi (specific use-cases only)
285  */
286 boolean_t
vfs_iskernelmount(mount_t mp)287 vfs_iskernelmount(mount_t mp)
288 {
289 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
290 }
291 
/*
 * kernel_mount():
 *	Private in-kernel mount entry point (specific use-cases only).
 *
 *	If `vp` is NULLVP, `path` is resolved here with namei() to obtain
 *	both the covered vnode and its parent; both iocounts are dropped
 *	again before returning.  If the caller supplies `vp` (and `pvp`),
 *	no lookup is performed and the caller retains ownership of the
 *	iocounts — only the componentname path buffer fields are filled in
 *	for mount_common()'s benefit.
 *
 * Returns:	0	Success
 *		!0	errno from namei() or mount_common()
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any caller-supplied bits outside the permitted kernel-flag set. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures for snapshot / volume-by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/* Caller supplied the vnodes; just hand the path to the componentname. */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Tag the request so mount_common() treats it as a kernel mount. */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only release iocounts / nameidata we acquired ourselves above. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
341 
342 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)343 vfs_mount_at_path(const char *fstype, const char *path,
344     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
345     int mnt_flags, int flags)
346 {
347 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
348 	int error, km_flags = 0;
349 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
350 
351 	/*
352 	 * This call is currently restricted to specific use cases.
353 	 */
354 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
355 		return ENOTSUP;
356 	}
357 
358 #if !defined(XNU_TARGET_OS_OSX)
359 	if (strcmp(fstype, "lifs") == 0) {
360 		syscall_flags |= MNT_NOEXEC;
361 	}
362 #endif
363 
364 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
365 		km_flags |= KERNEL_MOUNT_NOAUTH;
366 	}
367 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
368 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
369 	}
370 
371 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
372 	    syscall_flags, km_flags, ctx);
373 	if (error) {
374 		printf("%s: mount on %s failed, error %d\n", __func__, path,
375 		    error);
376 	}
377 
378 	return error;
379 }
380 
381 /*
382  * Mount a file system.
383  */
384 /* ARGSUSED */
385 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)386 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
387 {
388 	struct __mac_mount_args muap;
389 
390 	muap.type = uap->type;
391 	muap.path = uap->path;
392 	muap.flags = uap->flags;
393 	muap.data = uap->data;
394 	muap.mac_p = USER_ADDR_NULL;
395 	return __mac_mount(p, &muap, retval);
396 }
397 
/*
 * fmount():
 *	Mount a file system whose covered directory is named by an open
 *	file descriptor (uap->fd) rather than a path.
 *
 * Indirect:	uap->fd		fd referencing the vnode to cover
 *		uap->type	user pointer to the fs type name
 *		uap->flags	generic mount flags (MNT_IMGSRC_BY_INDEX,
 *				MNT_ROOTFS and MNT_UNION are rejected)
 *		uap->data	fs-specific mount arguments
 *
 * Returns:	0	Success
 *		!0	errno on failure
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname    cn;
	vfs_context_t           ctx = vfs_context_current();
	size_t                  dummy = 0;
	int                     error;
	int                     flags = uap->flags;
	char                    fstypename[MFSNAMELEN];
	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t                 pvp;
	vnode_t                 vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Hold the file reference (paired with file_drop below). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the covered vnode. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * mount_common() requires the parent of the covered vnode.  A vnode
	 * with no resolvable parent that is already a mount point or a
	 * filesystem root reports EBUSY; anything else is EINVAL.
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname carrying the covered vnode's full path. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release everything acquired above, in reverse order. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
471 
472 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
473 
474 /*
475  * Get the size of a graft file (a manifest or payload file).
476  * The vp should be an iocounted vnode.
477  */
478 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)479 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
480 {
481 	struct stat64 sb = {};
482 	int error;
483 
484 	*size = 0;
485 
486 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
487 	if (error) {
488 		return error;
489 	}
490 
491 	if (sb.st_size == 0) {
492 		error = ENODATA;
493 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
494 		error = EFBIG;
495 	} else {
496 		*size = (size_t) sb.st_size;
497 	}
498 
499 	return error;
500 }
501 
502 /*
503  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
504  * `size` must already be validated.
505  */
506 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)507 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
508 {
509 	return vn_rdwr(UIO_READ, graft_vp,
510 	           (caddr_t) buf, (int) size, /* offset */ 0,
511 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
512 	           vfs_context_ucred(vctx), /* resid */ NULL,
513 	           vfs_context_proc(vctx));
514 }
515 
516 /*
517  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
518  * and read it into `buf`.
519  */
520 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)521 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
522 {
523 	vnode_t metadata_vp = NULLVP;
524 	int error;
525 
526 	// Convert this graft fd to a vnode.
527 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
528 		goto out;
529 	}
530 
531 	// Get (and validate) size information.
532 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
533 		goto out;
534 	}
535 
536 	// Read each file into the provided buffer - we must get the expected amount of bytes.
537 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
538 		goto out;
539 	}
540 
541 out:
542 	if (metadata_vp) {
543 		vnode_put(metadata_vp);
544 		metadata_vp = NULLVP;
545 	}
546 
547 	return error;
548 }
549 
550 /*
551  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
552  * provided in `gfs`, saving the size of data read in `gfs`.
553  */
554 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)555 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
556     fsioc_graft_fs_t *gfs)
557 {
558 	int error;
559 
560 	// Read the authentic manifest.
561 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
562 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
563 		return error;
564 	}
565 
566 	// The user manifest is currently unused, but set its size.
567 	gfs->user_manifest_size = 0;
568 
569 	// Read the payload.
570 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
571 	    &gfs->payload_size, gfs->payload))) {
572 		return error;
573 	}
574 
575 	return 0;
576 }
577 
578 /*
579  * Call into the filesystem to verify and graft a cryptex.
580  */
581 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)582 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
583     vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
584 {
585 	fsioc_graft_fs_t gfs = {};
586 	uint64_t graft_dir_ino = 0;
587 	struct stat64 sb = {};
588 	int error;
589 
590 	// Pre-flight arguments.
591 	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
592 		// Make sure that this graft version matches what we support.
593 		return ENOTSUP;
594 	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
595 		// For this type, cryptex VP must live on same volume as the target of graft.
596 		return EXDEV;
597 	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
598 		// We cannot graft upon non-directories.
599 		return ENOTDIR;
600 	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
601 	    sbc_args->sbc_payload_fd < 0) {
602 		// We cannot graft without a manifest and payload.
603 		return EINVAL;
604 	}
605 
606 	if (mounton_vp) {
607 		// Get the mounton's inode number.
608 		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
609 		if (error) {
610 			return error;
611 		}
612 		graft_dir_ino = (uint64_t) sb.st_ino;
613 	}
614 
615 	// Create buffers (of our maximum-defined size) to store authentication info.
616 	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
617 	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
618 
619 	if (!gfs.authentic_manifest || !gfs.payload) {
620 		error = ENOMEM;
621 		goto out;
622 	}
623 
624 	// Read our fd's into our buffers.
625 	// (Note that this will set the buffer size fields in `gfs`.)
626 	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
627 	if (error) {
628 		goto out;
629 	}
630 
631 	gfs.graft_version = FSIOC_GRAFT_VERSION;
632 	gfs.graft_type = graft_type;
633 	gfs.graft_4cc = sbc_args->sbc_4cc;
634 	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
635 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
636 	}
637 	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
638 		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
639 	}
640 	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
641 		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
642 	}
643 	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
644 		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
645 	}
646 	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
647 		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
648 	}
649 	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
650 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
651 	}
652 	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
653 
654 	// Call into the FS to perform the graft (and validation).
655 	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
656 
657 out:
658 	if (gfs.authentic_manifest) {
659 		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
660 		gfs.authentic_manifest = NULL;
661 	}
662 	if (gfs.payload) {
663 		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
664 		gfs.payload = NULL;
665 	}
666 
667 	return error;
668 }
669 
670 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
671 
672 /*
673  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
674  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
675  */
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.  The mount
 * directory is optional; when absent, graft_secureboot_cryptex() is called
 * with a NULL mounton_vp.
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Entitlement gate: this syscall is private. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the graft argument union before doing any lookups. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		/* On namei() failure no nameidone() is needed; return directly. */
		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Validate the graft type, then hand off to the secure-boot path. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Drop iocounts acquired above; nameidone() only if namei() succeeded. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
743 
744 /*
745  * Ungraft a cryptex disk image (via mount dir FD)
746  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
747  */
748 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)749 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
750 {
751 	int error = 0;
752 	user_addr_t ua_mountdir = uap->mountdir;
753 	fsioc_ungraft_fs_t ugfs;
754 	vnode_t mounton_vp = NULLVP;
755 	struct nameidata nd = {};
756 	vfs_context_t ctx = vfs_context_current();
757 
758 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
759 		return EPERM;
760 	}
761 
762 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
763 		return EINVAL;
764 	}
765 
766 	ugfs.ungraft_flags = 0;
767 
768 	// Acquire vnode for mount-on path
769 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
770 	    UIO_USERSPACE, ua_mountdir, ctx);
771 
772 	error = namei(&nd);
773 	if (error) {
774 		return error;
775 	}
776 	mounton_vp = nd.ni_vp;
777 
778 	// Call into the FS to perform the ungraft
779 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
780 
781 	vnode_put(mounton_vp);
782 	nameidone(&nd);
783 
784 	return error;
785 }
786 
787 
/*
 * vfs_notify_mount():
 *	Broadcast that a mount has occurred: raise the VQ_MOUNT vfs event
 *	and post a NOTE_WRITE knote on the parent directory of the mount
 *	point so watchers (e.g. kqueue clients) see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
794 
795 /*
796  * __mac_mount:
797  *	Mount a file system taking into account MAC label behavior.
798  *	See mount(2) man page for more information
799  *
800  * Parameters:    p                        Process requesting the mount
801  *                uap                      User argument descriptor (see below)
802  *                retval                   (ignored)
803  *
804  * Indirect:      uap->type                Filesystem type
805  *                uap->path                Path to mount
806  *                uap->data                Mount arguments
807  *                uap->mac_p               MAC info
808  *                uap->flags               Mount flags
809  *
810  *
811  * Returns:        0                       Success
812  *                !0                       Not success
813  */
814 boolean_t root_fs_upgrade_try = FALSE;
815 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;        /* set once namei() succeeds, so cleanup is safe */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;         /* MAC label copied in from user space, if any */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (WANTPARENT also returns its parent).
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.  The user_mac
	 * struct layout differs between 32- and 64-bit callers, so copy in
	 * the matching variant and normalize it.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Reject absurd label lengths before allocating. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special-case mounts that target the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data(NULL, 0) is a no-op, so this is safe on early exits. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
976 
977 /*
978  * common mount implementation (final stage of mounting)
979  *
980  * Arguments:
981  *  fstypename	file system type (ie it's vfs name)
982  *  pvp		parent of covered vnode
983  *  vp		covered vnode
984  *  cnp		component name (ie path) of covered vnode
985  *  flags	generic mount flags
986  *  fsmountargs	file system specific data
987  *  labelstr	optional MAC label
988  *  kernelmount	TRUE for mounts initiated from inside the kernel
989  *  ctx		caller's context
990  */
991 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)992 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
993     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
994     char *labelstr, vfs_context_t ctx)
995 {
996 #if !CONFIG_MACF
997 #pragma unused(labelstr)
998 #endif
999 	struct vnode *devvp = NULLVP;
1000 	struct vnode *device_vnode = NULLVP;
1001 #if CONFIG_MACF
1002 	struct vnode *rvp;
1003 #endif
1004 	struct mount *mp = NULL;
1005 	struct vfstable *vfsp = (struct vfstable *)0;
1006 	struct proc *p = vfs_context_proc(ctx);
1007 	int error, flag = 0;
1008 	bool flag_set = false;
1009 	user_addr_t devpath = USER_ADDR_NULL;
1010 	int ronly = 0;
1011 	int mntalloc = 0;
1012 	boolean_t vfsp_ref = FALSE;
1013 	boolean_t is_rwlock_locked = FALSE;
1014 	boolean_t did_rele = FALSE;
1015 	boolean_t have_usecount = FALSE;
1016 	boolean_t did_set_lmount = FALSE;
1017 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1018 
1019 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1020 	/* Check for mutually-exclusive flag bits */
1021 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1022 	int bitcount = 0;
1023 	while (checkflags != 0) {
1024 		checkflags &= (checkflags - 1);
1025 		bitcount++;
1026 	}
1027 
1028 	if (bitcount > 1) {
1029 		//not allowed to request multiple mount-by-role flags
1030 		error = EINVAL;
1031 		goto out1;
1032 	}
1033 #endif
1034 
1035 	/*
1036 	 * Process an update for an existing mount
1037 	 */
1038 	if (flags & MNT_UPDATE) {
1039 		if ((vp->v_flag & VROOT) == 0) {
1040 			error = EINVAL;
1041 			goto out1;
1042 		}
1043 		mp = vp->v_mount;
1044 
1045 		/* if unmount or mount in progress, return error */
1046 		mount_lock_spin(mp);
1047 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1048 			mount_unlock(mp);
1049 			error = EBUSY;
1050 			goto out1;
1051 		}
1052 		mp->mnt_lflag |= MNT_LMOUNT;
1053 		did_set_lmount = TRUE;
1054 		mount_unlock(mp);
1055 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
1056 		is_rwlock_locked = TRUE;
1057 		/*
1058 		 * We only allow the filesystem to be reloaded if it
1059 		 * is currently mounted read-only.
1060 		 */
1061 		if ((flags & MNT_RELOAD) &&
1062 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1063 			error = ENOTSUP;
1064 			goto out1;
1065 		}
1066 
1067 		/*
1068 		 * If content protection is enabled, update mounts are not
1069 		 * allowed to turn it off.
1070 		 */
1071 		if ((mp->mnt_flag & MNT_CPROTECT) &&
1072 		    ((flags & MNT_CPROTECT) == 0)) {
1073 			error = EINVAL;
1074 			goto out1;
1075 		}
1076 
1077 		/*
1078 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1079 		 * failure to return an error for this so we'll just silently
1080 		 * add it if it is not passed in.
1081 		 */
1082 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
1083 		    ((flags & MNT_REMOVABLE) == 0)) {
1084 			flags |= MNT_REMOVABLE;
1085 		}
1086 
1087 		/* Can't downgrade the backer of the root FS */
1088 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1089 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1090 			error = ENOTSUP;
1091 			goto out1;
1092 		}
1093 
1094 		/*
1095 		 * Only root, or the user that did the original mount is
1096 		 * permitted to update it.
1097 		 */
1098 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1099 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1100 			goto out1;
1101 		}
1102 #if CONFIG_MACF
1103 		error = mac_mount_check_remount(ctx, mp);
1104 		if (error != 0) {
1105 			goto out1;
1106 		}
1107 #endif
1108 		/*
1109 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1110 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1111 		 */
1112 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1113 			flags |= MNT_NOSUID | MNT_NODEV;
1114 			if (mp->mnt_flag & MNT_NOEXEC) {
1115 				flags |= MNT_NOEXEC;
1116 			}
1117 		}
1118 		flag = mp->mnt_flag;
1119 		flag_set = true;
1120 
1121 
1122 
1123 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1124 
1125 		vfsp = mp->mnt_vtable;
1126 		goto update;
1127 	} // MNT_UPDATE
1128 
1129 	/*
1130 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1131 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1132 	 */
1133 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1134 		flags |= MNT_NOSUID | MNT_NODEV;
1135 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1136 			flags |= MNT_NOEXEC;
1137 		}
1138 	}
1139 
1140 	/* XXXAUDIT: Should we capture the type on the error path as well? */
1141 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
1142 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1143 	mount_list_lock();
1144 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1145 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1146 			vfsp->vfc_refcount++;
1147 			vfsp_ref = TRUE;
1148 			break;
1149 		}
1150 	}
1151 	mount_list_unlock();
1152 	if (vfsp == NULL) {
1153 		error = ENODEV;
1154 		goto out1;
1155 	}
1156 
1157 	/*
1158 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1159 	 * except in ROSV configs and for the initial BaseSystem root.
1160 	 */
1161 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1162 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1163 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1164 		error = EINVAL;  /* unsupported request */
1165 		goto out1;
1166 	}
1167 
1168 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1169 	if (error != 0) {
1170 		goto out1;
1171 	}
1172 
1173 	/*
1174 	 * Allocate and initialize the filesystem (mount_t)
1175 	 */
1176 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1177 	mntalloc = 1;
1178 
1179 	/* Initialize the default IO constraints */
1180 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1181 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1182 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1183 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1184 	mp->mnt_devblocksize = DEV_BSIZE;
1185 	mp->mnt_alignmentmask = PAGE_MASK;
1186 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1187 	mp->mnt_ioscale = 1;
1188 	mp->mnt_ioflags = 0;
1189 	mp->mnt_realrootvp = NULLVP;
1190 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1191 
1192 	mp->mnt_lflag |= MNT_LMOUNT;
1193 	did_set_lmount = TRUE;
1194 
1195 	TAILQ_INIT(&mp->mnt_vnodelist);
1196 	TAILQ_INIT(&mp->mnt_workerqueue);
1197 	TAILQ_INIT(&mp->mnt_newvnodes);
1198 	mount_lock_init(mp);
1199 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1200 	is_rwlock_locked = TRUE;
1201 	mp->mnt_op = vfsp->vfc_vfsops;
1202 	mp->mnt_vtable = vfsp;
1203 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
1204 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1205 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1206 	do {
1207 		size_t pathlen = MAXPATHLEN;
1208 
1209 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1210 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1211 		}
1212 	} while (0);
1213 	mp->mnt_vnodecovered = vp;
1214 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1215 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1216 	mp->mnt_devbsdunit = 0;
1217 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1218 
1219 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1220 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1221 
1222 	if (kernelmount) {
1223 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1224 	}
1225 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1226 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1227 	}
1228 
1229 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
1230 		// kernel mounted devfs
1231 		mp->mnt_kern_flag |= MNTK_SYSTEM;
1232 	}
1233 
1234 update:
1235 
1236 	/*
1237 	 * Set the mount level flags.
1238 	 */
1239 	if (flags & MNT_RDONLY) {
1240 		mp->mnt_flag |= MNT_RDONLY;
1241 	} else if (mp->mnt_flag & MNT_RDONLY) {
1242 		// disallow read/write upgrades of file systems that
1243 		// had the TYPENAME_OVERRIDE feature set.
1244 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1245 			error = EPERM;
1246 			goto out1;
1247 		}
1248 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
1249 	}
1250 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1251 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1252 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1253 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1254 	    MNT_QUARANTINE | MNT_CPROTECT);
1255 
1256 #if SECURE_KERNEL
1257 #if !CONFIG_MNT_SUID
1258 	/*
1259 	 * On release builds of iOS based platforms, always enforce NOSUID on
1260 	 * all mounts. We do this here because we can catch update mounts as well as
1261 	 * non-update mounts in this case.
1262 	 */
1263 	mp->mnt_flag |= (MNT_NOSUID);
1264 #endif
1265 #endif
1266 
1267 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1268 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1269 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1270 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1271 	    MNT_QUARANTINE | MNT_CPROTECT);
1272 
1273 #if CONFIG_MACF
1274 	if (flags & MNT_MULTILABEL) {
1275 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1276 			error = EINVAL;
1277 			goto out1;
1278 		}
1279 		mp->mnt_flag |= MNT_MULTILABEL;
1280 	}
1281 #endif
1282 	/*
1283 	 * Process device path for local file systems if requested.
1284 	 *
1285 	 * Snapshot and mount-by-role mounts do not use this path; they are
1286 	 * passing other opaque data in the device path field.
1287 	 *
1288 	 * Basesystemroot mounts pass a device path to be resolved here,
1289 	 * but it's just a char * already inside the kernel, which
1290 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
1291 	 * mounts we must skip copyin (both of the address and of the string
1292 	 * (in NDINIT).
1293 	 */
1294 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1295 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1296 		boolean_t do_copyin_devpath = true;
1297 #if CONFIG_BASESYSTEMROOT
1298 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1299 			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1300 			// We have been passed fsmountargs, which is typed as a user_addr_t,
1301 			// but is actually a char ** pointing to a (kernelspace) string.
1302 			// We manually unpack it with a series of casts and dereferences
1303 			// that reverses what was done just above us on the stack in
1304 			// imageboot_pivot_image().
1305 			// After retrieving the path to the dev node (which we will NDINIT
1306 			// in a moment), we pass NULL fsmountargs on to the filesystem.
1307 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1308 			char **devnamepp = (char **)fsmountargs;
1309 			char *devnamep = *devnamepp;
1310 			devpath = CAST_USER_ADDR_T(devnamep);
1311 			do_copyin_devpath = false;
1312 			fsmountargs = USER_ADDR_NULL;
1313 
1314 			//Now that we have a mp, denote that this mount is for the basesystem.
1315 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1316 		}
1317 #endif // CONFIG_BASESYSTEMROOT
1318 
1319 		if (do_copyin_devpath) {
1320 			if (vfs_context_is64bit(ctx)) {
1321 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1322 					goto out1;
1323 				}
1324 				fsmountargs += sizeof(devpath);
1325 			} else {
1326 				user32_addr_t tmp;
1327 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1328 					goto out1;
1329 				}
1330 				/* munge into LP64 addr */
1331 				devpath = CAST_USER_ADDR_T(tmp);
1332 				fsmountargs += sizeof(tmp);
1333 			}
1334 		}
1335 
1336 		/* Lookup device and authorize access to it */
1337 		if ((devpath)) {
1338 			struct nameidata nd;
1339 
1340 			enum uio_seg seg = UIO_USERSPACE;
1341 #if CONFIG_BASESYSTEMROOT
1342 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1343 				seg = UIO_SYSSPACE;
1344 			}
1345 #endif // CONFIG_BASESYSTEMROOT
1346 
1347 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1348 			if ((error = namei(&nd))) {
1349 				goto out1;
1350 			}
1351 
1352 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1353 			devvp = nd.ni_vp;
1354 
1355 			nameidone(&nd);
1356 
1357 			if (devvp->v_type != VBLK) {
1358 				error = ENOTBLK;
1359 				goto out2;
1360 			}
1361 			if (major(devvp->v_rdev) >= nblkdev) {
1362 				error = ENXIO;
1363 				goto out2;
1364 			}
1365 			/*
1366 			 * If mount by non-root, then verify that user has necessary
1367 			 * permissions on the device.
1368 			 */
1369 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1370 				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1371 
1372 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1373 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1374 				}
1375 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1376 					goto out2;
1377 				}
1378 			}
1379 		}
1380 		/* On first mount, preflight and open device */
1381 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1382 			if ((error = vnode_ref(devvp))) {
1383 				goto out2;
1384 			}
1385 			/*
1386 			 * Disallow multiple mounts of the same device.
1387 			 * Disallow mounting of a device that is currently in use
1388 			 * (except for root, which might share swap device for miniroot).
1389 			 * Flush out any old buffers remaining from a previous use.
1390 			 */
1391 			if ((error = vfs_setmounting(devvp))) {
1392 				vnode_rele(devvp);
1393 				goto out2;
1394 			}
1395 
1396 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1397 				error = EBUSY;
1398 				goto out3;
1399 			}
1400 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1401 				error = ENOTBLK;
1402 				goto out3;
1403 			}
1404 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1405 				goto out3;
1406 			}
1407 
1408 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1409 #if CONFIG_MACF
1410 			error = mac_vnode_check_open(ctx,
1411 			    devvp,
1412 			    ronly ? FREAD : FREAD | FWRITE);
1413 			if (error) {
1414 				goto out3;
1415 			}
1416 #endif /* MAC */
1417 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1418 				goto out3;
1419 			}
1420 
1421 			mp->mnt_devvp = devvp;
1422 			device_vnode = devvp;
1423 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1424 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1425 		    (device_vnode = mp->mnt_devvp)) {
1426 			dev_t dev;
1427 			int maj;
1428 			/*
1429 			 * If upgrade to read-write by non-root, then verify
1430 			 * that user has necessary permissions on the device.
1431 			 */
1432 			vnode_getalways(device_vnode);
1433 
1434 			if (suser(vfs_context_ucred(ctx), NULL) &&
1435 			    (error = vnode_authorize(device_vnode, NULL,
1436 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1437 			    ctx)) != 0) {
1438 				vnode_put(device_vnode);
1439 				goto out2;
1440 			}
1441 
1442 			/* Tell the device that we're upgrading */
1443 			dev = (dev_t)device_vnode->v_rdev;
1444 			maj = major(dev);
1445 
1446 			if ((u_int)maj >= (u_int)nblkdev) {
1447 				panic("Volume mounted on a device with invalid major number.");
1448 			}
1449 
1450 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1451 			vnode_put(device_vnode);
1452 			device_vnode = NULLVP;
1453 			if (error != 0) {
1454 				goto out2;
1455 			}
1456 		}
1457 	} // localargs && !(snapshot | data | vm)
1458 
1459 #if CONFIG_MACF
1460 	if ((flags & MNT_UPDATE) == 0) {
1461 		mac_mount_label_init(mp);
1462 		mac_mount_label_associate(ctx, mp);
1463 	}
1464 	if (labelstr) {
1465 		if ((flags & MNT_UPDATE) != 0) {
1466 			error = mac_mount_check_label_update(ctx, mp);
1467 			if (error != 0) {
1468 				goto out3;
1469 			}
1470 		}
1471 	}
1472 #endif
1473 	/*
1474 	 * Mount the filesystem.  We already asserted that internal_flags
1475 	 * cannot have more than one mount-by-role bit set.
1476 	 */
1477 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1478 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1479 		    (caddr_t)fsmountargs, 0, ctx);
1480 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1481 #if CONFIG_ROSV_STARTUP
1482 		struct mount *origin_mp = (struct mount*)fsmountargs;
1483 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1484 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1485 		if (error) {
1486 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1487 		} else {
1488 			/* Mark volume associated with system volume */
1489 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1490 
1491 			/* Attempt to acquire the mnt_devvp and set it up */
1492 			struct vnode *mp_devvp = NULL;
1493 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1494 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1495 				    0, &mp_devvp, vfs_context_kernel());
1496 				if (!lerr) {
1497 					mp->mnt_devvp = mp_devvp;
1498 					//vnode_lookup took an iocount, need to drop it.
1499 					vnode_put(mp_devvp);
1500 					// now set `device_vnode` to the devvp that was acquired.
1501 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1502 					// note that though the iocount above was dropped, the mount acquires
1503 					// an implicit reference against the device.
1504 					device_vnode = mp_devvp;
1505 				}
1506 			}
1507 		}
1508 #else
1509 		error = EINVAL;
1510 #endif
1511 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1512 #if CONFIG_MOUNT_VM
1513 		struct mount *origin_mp = (struct mount*)fsmountargs;
1514 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1515 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1516 		if (error) {
1517 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1518 		} else {
1519 			/* Mark volume associated with system volume and a swap mount */
1520 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1521 			/* Attempt to acquire the mnt_devvp and set it up */
1522 			struct vnode *mp_devvp = NULL;
1523 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1524 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1525 				    0, &mp_devvp, vfs_context_kernel());
1526 				if (!lerr) {
1527 					mp->mnt_devvp = mp_devvp;
1528 					//vnode_lookup took an iocount, need to drop it.
1529 					vnode_put(mp_devvp);
1530 
1531 					// now set `device_vnode` to the devvp that was acquired.
1532 					// note that though the iocount above was dropped, the mount acquires
1533 					// an implicit reference against the device.
1534 					device_vnode = mp_devvp;
1535 				}
1536 			}
1537 		}
1538 #else
1539 		error = EINVAL;
1540 #endif
1541 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1542 #if CONFIG_MOUNT_PREBOOTRECOVERY
1543 		struct mount *origin_mp = (struct mount*)fsmountargs;
1544 		uint32_t mount_role = 0;
1545 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1546 			mount_role = VFS_PREBOOT_ROLE;
1547 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1548 			mount_role = VFS_RECOVERY_ROLE;
1549 		}
1550 
1551 		if (mount_role != 0) {
1552 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1553 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1554 			if (error) {
1555 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1556 			} else {
1557 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1558 				/* Mark volume associated with system volume */
1559 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1560 				/* Attempt to acquire the mnt_devvp and set it up */
1561 				struct vnode *mp_devvp = NULL;
1562 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1563 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1564 					    0, &mp_devvp, vfs_context_kernel());
1565 					if (!lerr) {
1566 						mp->mnt_devvp = mp_devvp;
1567 						//vnode_lookup took an iocount, need to drop it.
1568 						vnode_put(mp_devvp);
1569 
1570 						// now set `device_vnode` to the devvp that was acquired.
1571 						// note that though the iocount above was dropped, the mount acquires
1572 						// an implicit reference against the device.
1573 						device_vnode = mp_devvp;
1574 					}
1575 				}
1576 			}
1577 		} else {
1578 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1579 			error = EINVAL;
1580 		}
1581 #else
1582 		error = EINVAL;
1583 #endif
1584 	} else {
1585 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1586 	}
1587 
1588 	if (flags & MNT_UPDATE) {
1589 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1590 			mp->mnt_flag &= ~MNT_RDONLY;
1591 		}
1592 		mp->mnt_flag &= ~
1593 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1594 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1595 		if (error) {
1596 			mp->mnt_flag = flag;  /* restore flag value */
1597 		}
1598 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1599 		lck_rw_done(&mp->mnt_rwlock);
1600 		is_rwlock_locked = FALSE;
1601 		if (!error) {
1602 			enablequotas(mp, ctx);
1603 		}
1604 		goto exit;
1605 	}
1606 
1607 	/*
1608 	 * Put the new filesystem on the mount list after root.
1609 	 */
1610 	if (error == 0) {
1611 		struct vfs_attr vfsattr;
1612 		if (device_vnode) {
1613 			/*
1614 			 *   cache the IO attributes for the underlying physical media...
1615 			 *   an error return indicates the underlying driver doesn't
1616 			 *   support all the queries necessary... however, reasonable
1617 			 *   defaults will have been set, so no reason to bail or care
1618 			 *
1619 			 *   Need to do this before calling the MAC hook as it needs
1620 			 *   information from this call.
1621 			 */
1622 			vfs_init_io_attributes(device_vnode, mp);
1623 		}
1624 
1625 #if CONFIG_MACF
1626 		error = mac_mount_check_mount_late(ctx, mp);
1627 		if (error != 0) {
1628 			goto out4;
1629 		}
1630 
1631 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1632 			error = VFS_ROOT(mp, &rvp, ctx);
1633 			if (error) {
1634 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1635 				goto out4;
1636 			}
1637 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1638 			/*
1639 			 * drop reference provided by VFS_ROOT
1640 			 */
1641 			vnode_put(rvp);
1642 
1643 			if (error) {
1644 				goto out4;
1645 			}
1646 		}
1647 #endif  /* MAC */
1648 
1649 		vnode_lock_spin(vp);
1650 		CLR(vp->v_flag, VMOUNT);
1651 		vp->v_mountedhere = mp;
1652 		SET(vp->v_flag, VMOUNTEDHERE);
1653 		vnode_unlock(vp);
1654 
1655 		/*
1656 		 * taking the name_cache_lock exclusively will
1657 		 * insure that everyone is out of the fast path who
1658 		 * might be trying to use a now stale copy of
1659 		 * vp->v_mountedhere->mnt_realrootvp
1660 		 * bumping mount_generation causes the cached values
1661 		 * to be invalidated
1662 		 */
1663 		name_cache_lock();
1664 		mount_generation++;
1665 		name_cache_unlock();
1666 
1667 		error = vnode_ref(vp);
1668 		if (error != 0) {
1669 			goto out4;
1670 		}
1671 
1672 		have_usecount = TRUE;
1673 
1674 		error = checkdirs(vp, ctx);
1675 		if (error != 0) {
1676 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1677 			goto out4;
1678 		}
1679 		/*
1680 		 * there is no cleanup code here so I have made it void
1681 		 * we need to revisit this
1682 		 */
1683 		(void)VFS_START(mp, 0, ctx);
1684 
1685 		if (mount_list_add(mp) != 0) {
1686 			/*
1687 			 * The system is shutting down trying to umount
1688 			 * everything, so fail with a plausible errno.
1689 			 */
1690 			error = EBUSY;
1691 			goto out4;
1692 		}
1693 		lck_rw_done(&mp->mnt_rwlock);
1694 		is_rwlock_locked = FALSE;
1695 
1696 		/* Check if this mounted file system supports EAs or named streams. */
1697 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1698 		VFSATTR_INIT(&vfsattr);
1699 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1700 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1701 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1702 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1703 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1704 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1705 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1706 			}
1707 #if NAMEDSTREAMS
1708 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1709 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1710 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1711 			}
1712 #endif
1713 			/* Check if this file system supports path from id lookups. */
1714 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1715 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1716 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1717 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1718 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1719 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1720 			}
1721 
1722 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1723 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1724 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1725 			}
1726 		}
1727 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1728 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1729 		}
1730 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1731 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1732 		}
1733 		/* increment the operations count */
1734 		OSAddAtomic(1, &vfs_nummntops);
1735 		enablequotas(mp, ctx);
1736 
1737 		if (device_vnode) {
1738 			vfs_setmountedon(device_vnode);
1739 		}
1740 
1741 		/* Now that mount is setup, notify the listeners */
1742 		vfs_notify_mount(pvp);
1743 		IOBSDMountChange(mp, kIOMountChangeMount);
1744 	} else {
1745 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1746 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1747 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1748 			    mp->mnt_vtable->vfc_name, error);
1749 		}
1750 
1751 		vnode_lock_spin(vp);
1752 		CLR(vp->v_flag, VMOUNT);
1753 		vnode_unlock(vp);
1754 		mount_list_lock();
1755 		mp->mnt_vtable->vfc_refcount--;
1756 		mount_list_unlock();
1757 
1758 		if (device_vnode) {
1759 			vnode_rele(device_vnode);
1760 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1761 			vfs_clearmounting(device_vnode);
1762 		}
1763 		lck_rw_done(&mp->mnt_rwlock);
1764 		is_rwlock_locked = FALSE;
1765 
1766 		if (nc_smr_enabled) {
1767 			vfs_smr_synchronize();
1768 		}
1769 
1770 		/*
1771 		 * if we get here, we have a mount structure that needs to be freed,
1772 		 * but since the coveredvp hasn't yet been updated to point at it,
1773 		 * no need to worry about other threads holding a crossref on this mp
1774 		 * so it's ok to just free it
1775 		 */
1776 		mount_lock_destroy(mp);
1777 #if CONFIG_MACF
1778 		mac_mount_label_destroy(mp);
1779 #endif
1780 		zfree(mount_zone, mp);
1781 		did_set_lmount = false;
1782 	}
1783 exit:
1784 	/*
1785 	 * drop I/O count on the device vp if there was one
1786 	 */
1787 	if (devpath && devvp) {
1788 		vnode_put(devvp);
1789 	}
1790 
1791 	if (did_set_lmount) {
1792 		mount_lock_spin(mp);
1793 		mp->mnt_lflag &= ~MNT_LMOUNT;
1794 		mount_unlock(mp);
1795 	}
1796 
1797 	return error;
1798 
1799 /* Error condition exits */
1800 out4:
1801 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1802 
1803 	/*
1804 	 * If the mount has been placed on the covered vp,
1805 	 * it may have been discovered by now, so we have
1806 	 * to treat this just like an unmount
1807 	 */
1808 	mount_lock_spin(mp);
1809 	mp->mnt_lflag |= MNT_LDEAD;
1810 	mount_unlock(mp);
1811 
1812 	if (device_vnode != NULLVP) {
1813 		vnode_rele(device_vnode);
1814 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1815 		    ctx);
1816 		vfs_clearmounting(device_vnode);
1817 		did_rele = TRUE;
1818 	}
1819 
1820 	vnode_lock_spin(vp);
1821 
1822 	mp->mnt_crossref++;
1823 	CLR(vp->v_flag, VMOUNTEDHERE);
1824 	vp->v_mountedhere = (mount_t) 0;
1825 
1826 	vnode_unlock(vp);
1827 
1828 	if (have_usecount) {
1829 		vnode_rele(vp);
1830 	}
1831 out3:
1832 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1833 		vnode_rele(devvp);
1834 		vfs_clearmounting(devvp);
1835 	}
1836 out2:
1837 	if (devpath && devvp) {
1838 		vnode_put(devvp);
1839 	}
1840 out1:
1841 	/* Release mnt_rwlock only when it was taken */
1842 	if (is_rwlock_locked == TRUE) {
1843 		if (flag_set) {
1844 			mp->mnt_flag = flag;  /* restore mnt_flag value */
1845 		}
1846 		lck_rw_done(&mp->mnt_rwlock);
1847 	}
1848 
1849 	if (did_set_lmount) {
1850 		mount_lock_spin(mp);
1851 		mp->mnt_lflag &= ~MNT_LMOUNT;
1852 		mount_unlock(mp);
1853 	}
1854 
1855 	if (mntalloc) {
1856 		if (mp->mnt_crossref) {
1857 			mount_dropcrossref(mp, vp, 0);
1858 		} else {
1859 			if (nc_smr_enabled) {
1860 				vfs_smr_synchronize();
1861 			}
1862 
1863 			mount_lock_destroy(mp);
1864 #if CONFIG_MACF
1865 			mac_mount_label_destroy(mp);
1866 #endif
1867 			zfree(mount_zone, mp);
1868 		}
1869 	}
1870 	if (vfsp_ref) {
1871 		mount_list_lock();
1872 		vfsp->vfc_refcount--;
1873 		mount_list_unlock();
1874 	}
1875 
1876 	return error;
1877 }
1878 
1879 /*
1880  * Flush in-core data, check for competing mount attempts,
1881  * and set VMOUNT
1882  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH callers skip the ownership check below */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.  Note that any
		 * vnode_getattr() failure is also reported as EPERM here.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data on the covered vnode before it is obscured */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Can only mount on top of a directory */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * fmount treats either an in-progress mount (VMOUNT) or an existing
	 * covering mount (v_mountedhere) as busy; the regular path only
	 * fails when both are present.
	 */
	vnode_lock_spin(vp);
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Mark mount-in-progress; cleared by caller on failure or placement */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	/* MAC veto undoes the VMOUNT marking taken just above */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1948 
1949 #if CONFIG_IMGSRC_ACCESS
1950 
1951 #define DEBUG_IMGSRC 0
1952 
1953 #if DEBUG_IMGSRC
1954 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1955 #else
1956 #define IMGSRC_DEBUG(args...) do { } while(0)
1957 #endif
1958 
1959 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1960 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1961 {
1962 	struct nameidata nd;
1963 	vnode_t vp, realdevvp;
1964 	kauth_action_t accessmode;
1965 	int error;
1966 	enum uio_seg uio = UIO_USERSPACE;
1967 
1968 	if (ctx == vfs_context_kernel()) {
1969 		uio = UIO_SYSSPACE;
1970 	}
1971 
1972 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1973 	if ((error = namei(&nd))) {
1974 		IMGSRC_DEBUG("namei() failed with %d\n", error);
1975 		return error;
1976 	}
1977 
1978 	vp = nd.ni_vp;
1979 
1980 	if (!vnode_isblk(vp)) {
1981 		IMGSRC_DEBUG("Not block device.\n");
1982 		error = ENOTBLK;
1983 		goto out;
1984 	}
1985 
1986 	realdevvp = mp->mnt_devvp;
1987 	if (realdevvp == NULLVP) {
1988 		IMGSRC_DEBUG("No device backs the mount.\n");
1989 		error = ENXIO;
1990 		goto out;
1991 	}
1992 
1993 	error = vnode_getwithref(realdevvp);
1994 	if (error != 0) {
1995 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1996 		goto out;
1997 	}
1998 
1999 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2000 		IMGSRC_DEBUG("Wrong dev_t.\n");
2001 		error = ENXIO;
2002 		goto out1;
2003 	}
2004 
2005 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2006 
2007 	/*
2008 	 * If mount by non-root, then verify that user has necessary
2009 	 * permissions on the device.
2010 	 */
2011 	if (!vfs_context_issuser(ctx)) {
2012 		accessmode = KAUTH_VNODE_READ_DATA;
2013 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2014 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2015 		}
2016 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2017 			IMGSRC_DEBUG("Access denied.\n");
2018 			goto out1;
2019 		}
2020 	}
2021 
2022 	*devvpp = vp;
2023 
2024 out1:
2025 	vnode_put(realdevvp);
2026 
2027 out:
2028 	nameidone(&nd);
2029 
2030 	if (error) {
2031 		vnode_put(vp);
2032 	}
2033 
2034 	return error;
2035 }
2036 
2037 /*
2038  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2039  * and call checkdirs()
2040  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Atomically swap the in-progress marker for the placed mount */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Covered vnode keeps a usecount for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* On any failure, detach the mount from the covered vnode again */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2087 
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode, clear its mount markers, and detach it from the mount.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2099 
2100 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2101 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2102 {
2103 	int error;
2104 
2105 	/* unmount in progress return error */
2106 	mount_lock_spin(mp);
2107 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2108 		mount_unlock(mp);
2109 		return EBUSY;
2110 	}
2111 	mount_unlock(mp);
2112 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2113 
2114 	/*
2115 	 * We only allow the filesystem to be reloaded if it
2116 	 * is currently mounted read-only.
2117 	 */
2118 	if ((flags & MNT_RELOAD) &&
2119 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2120 		error = ENOTSUP;
2121 		goto out;
2122 	}
2123 
2124 	/*
2125 	 * Only root, or the user that did the original mount is
2126 	 * permitted to update it.
2127 	 */
2128 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2129 	    (!vfs_context_issuser(ctx))) {
2130 		error = EPERM;
2131 		goto out;
2132 	}
2133 #if CONFIG_MACF
2134 	error = mac_mount_check_remount(ctx, mp);
2135 	if (error != 0) {
2136 		goto out;
2137 	}
2138 #endif
2139 
2140 out:
2141 	if (error) {
2142 		lck_rw_done(&mp->mnt_rwlock);
2143 	}
2144 
2145 	return error;
2146 }
2147 
/*
 * Release the mount rwlock taken by a successful mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2153 
2154 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2155 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2156 {
2157 	vnode_t vp;
2158 
2159 	if (height >= MAX_IMAGEBOOT_NESTING) {
2160 		return EINVAL;
2161 	}
2162 
2163 	vp = imgsrc_rootvnodes[height];
2164 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2165 		*rvpp = vp;
2166 		return 0;
2167 	} else {
2168 		return ENOENT;
2169 	}
2170 }
2171 
2172 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)2173 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
2174     struct componentname *cnp, const char *fsname, vfs_context_t ctx,
2175     boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
2176 {
2177 	int error;
2178 	mount_t mp;
2179 	boolean_t placed = FALSE;
2180 	struct vfstable *vfsp;
2181 	user_addr_t devpath;
2182 	char *old_mntonname;
2183 	vnode_t rvp;
2184 	vnode_t devvp;
2185 	uint32_t height;
2186 	uint32_t flags;
2187 
2188 	/* If we didn't imageboot, nothing to move */
2189 	if (imgsrc_rootvnodes[0] == NULLVP) {
2190 		return EINVAL;
2191 	}
2192 
2193 	/* Only root can do this */
2194 	if (!vfs_context_issuser(ctx)) {
2195 		return EPERM;
2196 	}
2197 
2198 	IMGSRC_DEBUG("looking for root vnode.\n");
2199 
2200 	/*
2201 	 * Get root vnode of filesystem we're moving.
2202 	 */
2203 	if (by_index) {
2204 		if (is64bit) {
2205 			struct user64_mnt_imgsrc_args mia64;
2206 			error = copyin(fsmountargs, &mia64, sizeof(mia64));
2207 			if (error != 0) {
2208 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
2209 				return error;
2210 			}
2211 
2212 			height = mia64.mi_height;
2213 			flags = mia64.mi_flags;
2214 			devpath = (user_addr_t)mia64.mi_devpath;
2215 		} else {
2216 			struct user32_mnt_imgsrc_args mia32;
2217 			error = copyin(fsmountargs, &mia32, sizeof(mia32));
2218 			if (error != 0) {
2219 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
2220 				return error;
2221 			}
2222 
2223 			height = mia32.mi_height;
2224 			flags = mia32.mi_flags;
2225 			devpath = mia32.mi_devpath;
2226 		}
2227 	} else {
2228 		/*
2229 		 * For binary compatibility--assumes one level of nesting.
2230 		 */
2231 		if (is64bit) {
2232 			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
2233 				return error;
2234 			}
2235 		} else {
2236 			user32_addr_t tmp;
2237 			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
2238 				return error;
2239 			}
2240 
2241 			/* munge into LP64 addr */
2242 			devpath = CAST_USER_ADDR_T(tmp);
2243 		}
2244 
2245 		height = 0;
2246 		flags = 0;
2247 	}
2248 
2249 	if (flags != 0) {
2250 		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
2251 		return EINVAL;
2252 	}
2253 
2254 	error = get_imgsrc_rootvnode(height, &rvp);
2255 	if (error != 0) {
2256 		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
2257 		return error;
2258 	}
2259 
2260 	IMGSRC_DEBUG("got old root vnode\n");
2261 
2262 	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
2263 
2264 	/* Can only move once */
2265 	mp = vnode_mount(rvp);
2266 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2267 		IMGSRC_DEBUG("Already moved.\n");
2268 		error = EBUSY;
2269 		goto out0;
2270 	}
2271 
2272 	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
2273 	IMGSRC_DEBUG("Starting updated.\n");
2274 
2275 	/* Get exclusive rwlock on mount, authorize update on mp */
2276 	error = mount_begin_update(mp, ctx, 0);
2277 	if (error != 0) {
2278 		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
2279 		goto out0;
2280 	}
2281 
2282 	/*
2283 	 * It can only be moved once.  Flag is set under the rwlock,
2284 	 * so we're now safe to proceed.
2285 	 */
2286 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2287 		IMGSRC_DEBUG("Already moved [2]\n");
2288 		goto out1;
2289 	}
2290 
2291 	IMGSRC_DEBUG("Preparing coveredvp.\n");
2292 
2293 	/* Mark covered vnode as mount in progress, authorize placing mount on top */
2294 	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
2295 	if (error != 0) {
2296 		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
2297 		goto out1;
2298 	}
2299 
2300 	IMGSRC_DEBUG("Covered vp OK.\n");
2301 
2302 	/* Sanity check the name caller has provided */
2303 	vfsp = mp->mnt_vtable;
2304 	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
2305 		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
2306 		    vfsp->vfc_name, fsname);
2307 		error = EINVAL;
2308 		goto out2;
2309 	}
2310 
2311 	/* Check the device vnode and update mount-from name, for local filesystems */
2312 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
2313 		IMGSRC_DEBUG("Local, doing device validation.\n");
2314 
2315 		if (devpath != USER_ADDR_NULL) {
2316 			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
2317 			if (error) {
2318 				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
2319 				goto out2;
2320 			}
2321 
2322 			vnode_put(devvp);
2323 		}
2324 	}
2325 
2326 	/*
2327 	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
2328 	 * and increment the name cache's mount generation
2329 	 */
2330 
2331 	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
2332 	error = place_mount_and_checkdirs(mp, vp, ctx);
2333 	if (error != 0) {
2334 		goto out2;
2335 	}
2336 
2337 	placed = TRUE;
2338 
2339 	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2340 	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2341 
2342 	/* Forbid future moves */
2343 	mount_lock(mp);
2344 	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2345 	mount_unlock(mp);
2346 
2347 	/* Finally, add to mount list, completely ready to go */
2348 	if (mount_list_add(mp) != 0) {
2349 		/*
2350 		 * The system is shutting down trying to umount
2351 		 * everything, so fail with a plausible errno.
2352 		 */
2353 		error = EBUSY;
2354 		goto out3;
2355 	}
2356 
2357 	mount_end_update(mp);
2358 	vnode_put(rvp);
2359 	zfree(ZV_NAMEI, old_mntonname);
2360 
2361 	vfs_notify_mount(pvp);
2362 
2363 	return 0;
2364 out3:
2365 	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2366 
2367 	mount_lock(mp);
2368 	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2369 	mount_unlock(mp);
2370 
2371 out2:
2372 	/*
2373 	 * Placing the mp on the vnode clears VMOUNT,
2374 	 * so cleanup is different after that point
2375 	 */
2376 	if (placed) {
2377 		/* Rele the vp, clear VMOUNT and v_mountedhere */
2378 		undo_place_on_covered_vp(mp, vp);
2379 	} else {
2380 		vnode_lock_spin(vp);
2381 		CLR(vp->v_flag, VMOUNT);
2382 		vnode_unlock(vp);
2383 	}
2384 out1:
2385 	mount_end_update(mp);
2386 
2387 out0:
2388 	vnode_put(rvp);
2389 	zfree(ZV_NAMEI, old_mntonname);
2390 	return error;
2391 }
2392 
2393 #endif /* CONFIG_IMGSRC_ACCESS */
2394 
2395 void
enablequotas(struct mount * mp,vfs_context_t ctx)2396 enablequotas(struct mount *mp, vfs_context_t ctx)
2397 {
2398 	struct nameidata qnd;
2399 	int type;
2400 	char qfpath[MAXPATHLEN];
2401 	const char *qfname = QUOTAFILENAME;
2402 	const char *qfopsname = QUOTAOPSNAME;
2403 	const char *qfextension[] = INITQFNAMES;
2404 
2405 	/* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
2406 	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
2407 		return;
2408 	}
2409 	/*
2410 	 * Enable filesystem disk quotas if necessary.
2411 	 * We ignore errors as this should not interfere with final mount
2412 	 */
2413 	for (type = 0; type < MAXQUOTAS; type++) {
2414 		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
2415 		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
2416 		    CAST_USER_ADDR_T(qfpath), ctx);
2417 		if (namei(&qnd) != 0) {
2418 			continue;           /* option file to trigger quotas is not present */
2419 		}
2420 		vnode_put(qnd.ni_vp);
2421 		nameidone(&qnd);
2422 		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
2423 
2424 		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
2425 	}
2426 	return;
2427 }
2428 
2429 
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is the just-covered vnode (olddp), retarget it to the
 * root of the newly mounted file system (newdp), fixing up vnode
 * usecounts as needed.  Always returns PROC_RETURNED so iteration
 * continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 * old_* are refs the process held on olddp that we displaced;
	 * new_* are the pre-taken refs on newdp we didn't end up using.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2509 
2510 
2511 
2512 /*
2513  * Scan all active processes to see if any of them have a current
2514  * or root directory onto which the new filesystem has just been
2515  * mounted. If so, replace them with the new mount point.
2516  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount is the mount's own: nobody has it as cwd/root */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Root of the file system now covering olddp, with an iocount */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* The system-wide root also moves to the new mount's root */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2554 
2555 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2556 	"com.apple.private.vfs.role-account-unmount"
2557 
2558 /*
2559  * Unmount a file system.
2560  *
2561  * Note: unmount takes a path to the vnode mounted on as argument,
2562  * not special file (as before).
2563  */
2564 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Swap the namei iocount on vp for a real ref on the mount */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2611 
2612 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2613 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2614 {
2615 	mount_t mp;
2616 
2617 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2618 	if (mp == (mount_t)0) {
2619 		return ENOENT;
2620 	}
2621 	mount_ref(mp, 0);
2622 	mount_iterdrop(mp);
2623 	/* safedounmount consumes the mount ref */
2624 	return safedounmount(mp, flags, ctx);
2625 }
2626 
2627 /*
2628  * The mount struct comes with a mount ref which will be consumed.
2629  * Do the actual file system unmount, prevent some common foot shooting.
2630  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 *
	 * NOTE(review): MNT_LNOTRESP looks like an mnt_lflag bit but is
	 * tested here against mnt_kern_flag — confirm against mount.h.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() consumes the mount ref on both success and failure */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Early failure: drop the mount ref the caller handed us */
	mount_drop(mp, 0);
	return error;
}
2694 
2695 /*
2696  * Do the actual file system unmount.
2697  */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Don't let unresponsive remote file systems hang this process */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* Publish unmount-in-progress; new lookups/mounts will back off */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* Graceful unmount: push all dirty data first */
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: back out the in-progress markers */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* Reclaim all vnodes on this mount (forced close when FORCECLOSE) */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	/* Hand the unmount to the file system itself */
	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock around the list removal, then retake it */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		/* crossref keeps mp alive until mount_dropcrossref() below */
		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: mount lock is held on every path reaching here */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			/* May free mp if this was the last crossref */
			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no covered vnode; destroy it directly */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2985 
2986 /*
2987  * Unmount any mounts in this filesystem.
2988  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: can't block for memory while holding mount_list_lock */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 * fsids[0..m] therefore grows into the transitive set of submounts.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* dounmount() consumes this ref */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3046 
/*
 * mount_dropcrossref: drop one cross reference on 'mp' held via the covered
 * vnode 'dp'.  When the last crossref goes away and 'dp' no longer points at
 * this mount (v_mountedhere), the mount structure itself is destroyed.
 *
 * If 'need_put' is set, an iocount on 'dp' is released as well.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/*
	 * Last crossref gone and the vnode is no longer covered by this
	 * mount: free the mount structure.
	 */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/*
		 * NOTE(review): presumably this waits out SMR readers in the
		 * name cache before the mount memory is freed — confirm.
		 */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3080 
3081 
3082 /*
3083  * Sync each mounted filesystem.
3084  */
#if DIAGNOSTIC
int syncprt = 0;	/* when set, sync paths dump buffer stats via vfs_bufstats() */
#endif

int print_vmpage_stat = 0;	/* when set, sync paths report dirty pages via vm_countdirtypages() */
3090 
3091 /*
3092  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3093  *			mounted read-write with the passed waitfor value.
3094  *
3095  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3096  *		arg	user argument (please see below)
3097  *
3098  * User argument is a pointer to 32 bit unsigned integer which describes the
3099  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3100  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3101  * waitfor value.
3102  *
3103  * Returns:		VFS_RETURNED
3104  */
3105 static int
sync_callback(mount_t mp,void * arg)3106 sync_callback(mount_t mp, void *arg)
3107 {
3108 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3109 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3110 		unsigned waitfor = MNT_NOWAIT;
3111 
3112 		if (arg) {
3113 			waitfor = *(uint32_t*)arg;
3114 		}
3115 
3116 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3117 		if (waitfor != MNT_WAIT &&
3118 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3119 		    waitfor != MNT_NOWAIT &&
3120 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3121 		    waitfor != MNT_DWAIT &&
3122 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3123 			panic("Passed inappropriate waitfor %u to "
3124 			    "sync_callback()", waitfor);
3125 		}
3126 
3127 		mp->mnt_flag &= ~MNT_ASYNC;
3128 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3129 		if (asyncflag) {
3130 			mp->mnt_flag |= MNT_ASYNC;
3131 		}
3132 	}
3133 
3134 	return VFS_RETURNED;
3135 }
3136 
3137 /* ARGSUSED */
3138 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3139 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3140 {
3141 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3142 
3143 	if (print_vmpage_stat) {
3144 		vm_countdirtypages();
3145 	}
3146 
3147 #if DIAGNOSTIC
3148 	if (syncprt) {
3149 		vfs_bufstats();
3150 	}
3151 #endif /* DIAGNOSTIC */
3152 	return 0;
3153 }
3154 
/*
 * Media classes used to partition the work done by sync_thread():
 * reliable (local, non-virtual-device) media are synced before
 * unreliable media.  See sync_internal_callback().
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
3160 
3161 static int
sync_internal_callback(mount_t mp,void * arg)3162 sync_internal_callback(mount_t mp, void *arg)
3163 {
3164 	if (arg) {
3165 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3166 		    (mp->mnt_flag & MNT_LOCAL);
3167 		sync_type_t sync_type = *((sync_type_t *)arg);
3168 
3169 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3170 			return VFS_RETURNED;
3171 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3172 			return VFS_RETURNED;
3173 		}
3174 	}
3175 
3176 	(void)sync_callback(mp, NULL);
3177 
3178 	return VFS_RETURNED;
3179 }
3180 
/* Protected by sync_mtx_lck; see sync_internal() and sync_thread(). */
int sync_thread_state = 0;
/* How long sync_internal() waits for the sync thread before logging a timeout. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001	/* a run request is pending for the sync thread */
#define SYNC_THREAD_RUNNING   0x0002	/* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;	/* identity of the running sync thread, for write accounting */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3190 
/*
 * sync_thread: worker thread servicing run requests posted by
 * sync_internal().  Loops while SYNC_THREAD_RUN is set; each pass syncs
 * reliable (local, non-virtual) media first, then unreliable media.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; the mutex is dropped while syncing. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3234 
/* Last time a sync timeout was logged; rate-limits the message in sync_internal(). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3236 
3237 /*
3238  * An in-kernel sync for power management to call.
3239  * This function always returns within sync_timeout seconds.
3240  */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Post a run request; spawn a worker thread if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (at most sync_timeout_seconds) for the worker to signal
	 * completion; PDROP releases sync_mtx_lck while sleeping.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once every two minutes. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3283 
3284 /*
3285  * Change filesystem quotas.
3286  */
3287 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve uap->path to find the mount the quota op applies to. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Keep the mount alive with a ref; the looked-up vnode is not needed. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				/* Convert the 64-bit user layout to the kernel dqblk. */
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy out results and release resources acquired in the copyin phase. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3394 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out. */
	return EOPNOTSUPP;
}
3400 #endif /* QUOTA */
3401 
3402 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3403 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3404 {
3405 	int error;
3406 	vfs_context_t ctx = vfs_context_current();
3407 
3408 #if CONFIG_MACF
3409 	error = mac_mount_check_stat(ctx, mp);
3410 	if (error != 0) {
3411 		return error;
3412 	}
3413 #endif
3414 
3415 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3416 	if (error != 0) {
3417 		return error;
3418 	}
3419 
3420 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3421 }
3422 
3423 /*
3424  * Get filesystem statistics.
3425  *
3426  * Returns:	0			Success
3427  *	namei:???
3428  *	vfs_update_vfsstat:???
3429  *	munge_statfs:EFAULT
3430  */
3431 /* ARGSUSED */
3432 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3433 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3434 {
3435 	int error;
3436 	struct mount *mp;
3437 	struct nameidata nd;
3438 	vfs_context_t ctx = vfs_context_current();
3439 	vnode_t vp;
3440 
3441 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3442 	    UIO_USERSPACE, uap->path, ctx);
3443 	error = namei(&nd);
3444 	if (error != 0) {
3445 		return error;
3446 	}
3447 	vp = nd.ni_vp;
3448 	mp = vp->v_mount;
3449 	nameidone(&nd);
3450 
3451 	error = statfs_internal(p, mp, uap->buf);
3452 	vnode_put(vp);
3453 
3454 	return error;
3455 }
3456 
3457 /*
3458  * Get filesystem statistics.
3459  */
3460 /* ARGSUSED */
3461 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3462 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3463 {
3464 	int error;
3465 	vnode_t vp = NULL;
3466 	struct mount *mp;
3467 
3468 	AUDIT_ARG(fd, uap->fd);
3469 
3470 	if ((error = file_vnode(uap->fd, &vp)) ||
3471 	    (error = vnode_getwithref(vp))) {
3472 		goto out;
3473 	}
3474 
3475 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3476 
3477 	mp = vp->v_mount;
3478 	if (!mp) {
3479 		error = EBADF;
3480 		goto out_vnode;
3481 	}
3482 
3483 	error = statfs_internal(p, mp, uap->buf);
3484 
3485 out_vnode:
3486 	vnode_put(vp);
3487 
3488 out:
3489 	if (vp != NULL) {
3490 		file_drop(uap->fd);
3491 	}
3492 
3493 	return error;
3494 }
3495 
3496 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3497 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3498 {
3499 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3500 
3501 	bzero(sfs, sizeof(*sfs));
3502 
3503 	sfs->f_bsize = vsfs->f_bsize;
3504 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3505 	sfs->f_blocks = vsfs->f_blocks;
3506 	sfs->f_bfree = vsfs->f_bfree;
3507 	sfs->f_bavail = vsfs->f_bavail;
3508 	sfs->f_files = vsfs->f_files;
3509 	sfs->f_ffree = vsfs->f_ffree;
3510 	sfs->f_fsid = vsfs->f_fsid;
3511 	sfs->f_owner = vsfs->f_owner;
3512 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3513 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3514 	sfs->f_fssubtype = vsfs->f_fssubtype;
3515 	sfs->f_flags_ext = 0;
3516 	if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3517 		sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3518 	}
3519 	if (mp->mnt_kern_flag & MNTK_FSKIT) {
3520 		sfs->f_flags_ext |= MNT_EXT_FSKIT;
3521 	}
3522 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3523 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3524 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3525 }
3526 
3527 /*
3528  * Get file system statistics in 64-bit mode
3529  */
3530 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3531 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3532 {
3533 	struct mount *mp;
3534 	int error;
3535 	struct nameidata *ndp;
3536 	struct statfs64 *sfsp;
3537 	vfs_context_t ctxp = vfs_context_current();
3538 	vnode_t vp;
3539 	struct {
3540 		struct nameidata nd;
3541 		struct statfs64 sfs;
3542 	} *__nameidata_statfs64;
3543 
3544 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3545 	    Z_WAITOK);
3546 	ndp = &__nameidata_statfs64->nd;
3547 
3548 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3549 	    UIO_USERSPACE, uap->path, ctxp);
3550 	error = namei(ndp);
3551 	if (error != 0) {
3552 		goto out;
3553 	}
3554 	vp = ndp->ni_vp;
3555 	mp = vp->v_mount;
3556 	nameidone(ndp);
3557 
3558 #if CONFIG_MACF
3559 	error = mac_mount_check_stat(ctxp, mp);
3560 	if (error != 0) {
3561 		vnode_put(vp);
3562 		goto out;
3563 	}
3564 #endif
3565 
3566 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3567 	if (error != 0) {
3568 		vnode_put(vp);
3569 		goto out;
3570 	}
3571 
3572 	sfsp = &__nameidata_statfs64->sfs;
3573 	vfs_get_statfs64(mp, sfsp);
3574 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3575 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3576 		/* This process does not want to see a seperate data volume mountpoint */
3577 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3578 	}
3579 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3580 	vnode_put(vp);
3581 
3582 out:
3583 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3584 
3585 	return error;
3586 }
3587 
3588 /*
3589  * Get file system statistics in 64-bit mode
3590  */
3591 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3592 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3593 {
3594 	struct vnode *vp;
3595 	struct mount *mp;
3596 	struct statfs64 sfs;
3597 	int error;
3598 
3599 	AUDIT_ARG(fd, uap->fd);
3600 
3601 	if ((error = file_vnode(uap->fd, &vp))) {
3602 		return error;
3603 	}
3604 
3605 	error = vnode_getwithref(vp);
3606 	if (error) {
3607 		file_drop(uap->fd);
3608 		return error;
3609 	}
3610 
3611 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3612 
3613 	mp = vp->v_mount;
3614 	if (!mp) {
3615 		error = EBADF;
3616 		goto out;
3617 	}
3618 
3619 #if CONFIG_MACF
3620 	error = mac_mount_check_stat(vfs_context_current(), mp);
3621 	if (error != 0) {
3622 		goto out;
3623 	}
3624 #endif
3625 
3626 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3627 		goto out;
3628 	}
3629 
3630 	vfs_get_statfs64(mp, &sfs);
3631 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3632 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3633 		/* This process does not want to see a seperate data volume mountpoint */
3634 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3635 	}
3636 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3637 
3638 out:
3639 	file_drop(uap->fd);
3640 	vnode_put(vp);
3641 
3642 	return error;
3643 }
3644 
/*
 * Iteration state shared by the getfsstat*_callback() functions.
 */
struct getfsstat_struct {
	user_addr_t     sfsp;		/* user buffer cursor for statfs records */
	user_addr_t     *mp;		/* optional array of user MAC label pointers */
	int             count;		/* total mounts seen so far */
	int             maxcount;	/* records that fit in the user buffer */
	int             flags;		/* caller's MNT_WAIT/MNT_NOWAIT/... flags */
	int             error;		/* first error encountered, if any */
};
3653 
3654 
/*
 * getfsstat_callback: per-mount worker for __mac_getfsstat().
 * Copies one statfs record (and optionally its MAC label) to user space
 * while buffer room remains; always counts the mount.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy data out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor past the record just written. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Always count the mount, even when nothing was copied out. */
	fstp->count++;
	return VFS_RETURNED;
}
3708 
3709 /*
3710  * Get statistics on all filesystems.
3711  */
3712 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3713 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3714 {
3715 	struct __mac_getfsstat_args muap;
3716 
3717 	muap.buf = uap->buf;
3718 	muap.bufsize = uap->bufsize;
3719 	muap.mac = USER_ADDR_NULL;
3720 	muap.macsize = 0;
3721 	muap.flags = uap->flags;
3722 
3723 	return __mac_getfsstat(p, &muap, retval);
3724 }
3725 
3726 /*
3727  * __mac_getfsstat: Get MAC-related file system statistics
3728  *
3729  * Parameters:    p                        (ignored)
3730  *                uap                      User argument descriptor (see below)
3731  *                retval                   Count of file system statistics (N stats)
3732  *
3733  * Indirect:      uap->bufsize             Buffer size
3734  *                uap->macsize             MAC info size
3735  *                uap->buf                 Buffer where information will be returned
3736  *                uap->mac                 MAC info
3737  *                uap->flags               File system flags
3738  *
3739  *
3740  * Returns:        0                       Success
3741  *                !0                       Not success
3742  *
3743  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int counts used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* How many statfs records fit in the user buffer (ABI-dependent size). */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The caller must supply exactly one label pointer per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* NOSKIP_UNMOUNT: include mounts that are in the middle of unmounting. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report records written when the buffer filled, else total mounts seen. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3837 
/*
 * getfsstat64_callback: per-mount worker for getfsstat64().
 * Copies one struct statfs64 to user space while buffer room remains;
 * always counts the mount.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Only copy data out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor past the record just written. */
		fstp->sfsp += sizeof(sfs);
	}
	/* Always count the mount, even when nothing was copied out. */
	fstp->count++;
	return VFS_RETURNED;
}
3882 
3883 /*
3884  * Get statistics on all file systems in 64 bit mode.
3885  */
3886 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3887 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3888 {
3889 	user_addr_t sfsp;
3890 	int count, maxcount;
3891 	struct getfsstat_struct fst;
3892 
3893 	maxcount = uap->bufsize / sizeof(struct statfs64);
3894 
3895 	sfsp = uap->buf;
3896 	count = 0;
3897 
3898 	fst.sfsp = sfsp;
3899 	fst.flags = uap->flags;
3900 	fst.count = 0;
3901 	fst.error = 0;
3902 	fst.maxcount = maxcount;
3903 
3904 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3905 
3906 	if (fst.error) {
3907 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3908 		return fst.error;
3909 	}
3910 
3911 	if (fst.sfsp && fst.count > fst.maxcount) {
3912 		*retval = fst.maxcount;
3913 	} else {
3914 		*retval = fst.count;
3915 	}
3916 
3917 	return 0;
3918 }
3919 
3920 /*
3921  * gets the associated vnode with the file descriptor passed.
3922  * as input
3923  *
3924  * INPUT
3925  * ctx - vfs context of caller
3926  * fd - file descriptor for which vnode is required.
3927  * vpp - Pointer to pointer to vnode to be returned.
3928  *
3929  * The vnode is returned with an iocount so any vnode obtained
3930  * by this call needs a vnode_put
3931  *
3932  */
3933 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3934 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3935 {
3936 	int error;
3937 	vnode_t vp;
3938 	struct fileproc *fp;
3939 	proc_t p = vfs_context_proc(ctx);
3940 
3941 	*vpp =  NULLVP;
3942 
3943 	error = fp_getfvp(p, fd, &fp, &vp);
3944 	if (error) {
3945 		return error;
3946 	}
3947 
3948 	error = vnode_getwithref(vp);
3949 	if (error) {
3950 		(void)fp_drop(p, fd, fp, 0);
3951 		return error;
3952 	}
3953 
3954 	(void)fp_drop(p, fd, fp, 0);
3955 	*vpp = vp;
3956 	return error;
3957 }
3958 
3959 /*
3960  * Wrapper function around namei to start lookup from a directory
3961  * specified by a file descriptor ni_dirfd.
3962  *
3963  * In addition to all the errors returned by namei, this call can
3964  * return ENOTDIR if the file descriptor does not refer to a directory.
3965  * and EBADF if the file descriptor is not valid.
3966  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * dirfd only matters for a relative path on a fresh lookup; a
	 * continued lookup (NAMEI_CONTLOOKUP) or a caller-supplied
	 * starting dvp (USEDVP) takes precedence.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to test for '/'. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Run the lookup rooted at dirfd's vnode. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, or no dirfd override: plain namei(). */
	return namei(ndp);
}
4010 
4011 /*
4012  * Change current working directory to a given file descriptor.
4013  */
4014 /* ARGSUSED */
/*
 * fchdir: change the current working directory to the directory open on
 * file descriptor 'fd'.
 *
 * If 'per_thread' is true the change applies only to the calling thread
 * (uthread uu_cdir); fd == -1 then means "revert to the per-process cwd".
 * Otherwise the process-wide fd_cdir is replaced.
 */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new cwd must be a directory the caller may search. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If something is mounted on this directory, descend to the root of
	 * the covering mount (repeatedly, for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount on the new cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4126 
4127 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4128 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4129 {
4130 	return fchdir(p, vfs_context_current(), uap->fd, false);
4131 }
4132 
4133 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4134 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4135 {
4136 	return fchdir(p, vfs_context_current(), uap->fd, true);
4137 }
4138 
4139 
4140 /*
4141  * Change current working directory (".").
4142  *
4143  * Returns:	0			Success
4144  *	change_dir:ENOTDIR
4145  *	change_dir:???
4146  *	vnode_ref:ENOENT		No such file or directory
4147  */
4148 /* ARGSUSED */
4149 int
chdir_internal(proc_t p,vfs_context_t ctx,struct nameidata * ndp,int per_thread)4150 chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
4151 {
4152 	int error;
4153 	vnode_t tvp;
4154 
4155 	error = change_dir(ndp, ctx);
4156 	if (error) {
4157 		return error;
4158 	}
4159 	if ((error = vnode_ref(ndp->ni_vp))) {
4160 		vnode_put(ndp->ni_vp);
4161 		return error;
4162 	}
4163 	/*
4164 	 * drop the iocount we picked up in change_dir
4165 	 */
4166 	vnode_put(ndp->ni_vp);
4167 
4168 	if (per_thread) {
4169 		thread_t th = vfs_context_thread(ctx);
4170 		if (th) {
4171 			uthread_t uth = get_bsdthread_info(th);
4172 			tvp = uth->uu_cdir;
4173 			uth->uu_cdir = ndp->ni_vp;
4174 			OSBitOrAtomic(P_THCWD, &p->p_flag);
4175 		} else {
4176 			vnode_rele(ndp->ni_vp);
4177 			return ENOENT;
4178 		}
4179 	} else {
4180 		proc_dirs_lock_exclusive(p);
4181 		proc_fdlock(p);
4182 		tvp = p->p_fd.fd_cdir;
4183 		p->p_fd.fd_cdir = ndp->ni_vp;
4184 		proc_fdunlock(p);
4185 		proc_dirs_unlock_exclusive(p);
4186 	}
4187 
4188 	if (tvp) {
4189 		vnode_rele(tvp);
4190 	}
4191 
4192 	return 0;
4193 }
4194 
4195 
4196 /*
4197  * Change current working directory (".").
4198  *
4199  * Returns:	0			Success
4200  *	chdir_internal:ENOTDIR
4201  *	chdir_internal:ENOENT		No such file or directory
4202  *	chdir_internal:???
4203  */
4204 /* ARGSUSED */
4205 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4206 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4207 {
4208 	struct nameidata nd;
4209 	vfs_context_t ctx = vfs_context_current();
4210 
4211 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4212 	    UIO_USERSPACE, uap->path, ctx);
4213 
4214 	return chdir_internal(p, ctx, &nd, per_thread);
4215 }
4216 
4217 
4218 /*
4219  * chdir
4220  *
4221  * Change current working directory (".") for the entire process
4222  *
4223  * Parameters:  p       Process requesting the call
4224  *              uap     User argument descriptor (see below)
4225  *              retval  (ignored)
4226  *
4227  * Indirect parameters:	uap->path	Directory path
4228  *
4229  * Returns:	0			Success
4230  *              common_chdir: ENOTDIR
4231  *              common_chdir: ENOENT	No such file or directory
4232  *              common_chdir: ???
4233  *
4234  */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/*
	 * chdir_args and __pthread_chdir_args share the same layout (a
	 * single path), so common_chdir takes either via the cast.
	 * per_thread == 0: the cwd change applies to the whole process.
	 */
	return common_chdir(p, (void *)uap, 0);
}
4240 
4241 /*
4242  * __pthread_chdir
4243  *
4244  * Change current working directory (".") for a single thread
4245  *
4246  * Parameters:  p       Process requesting the call
4247  *              uap     User argument descriptor (see below)
4248  *              retval  (ignored)
4249  *
4250  * Indirect parameters:	uap->path	Directory path
4251  *
4252  * Returns:	0			Success
4253  *              common_chdir: ENOTDIR
4254  *		common_chdir: ENOENT	No such file or directory
4255  *		common_chdir: ???
4256  *
4257  */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/*
	 * __pthread_chdir_args is layout-compatible with chdir_args, hence
	 * the cast.  per_thread == 1: only the calling thread's cwd changes.
	 */
	return common_chdir(p, (void *)uap, 1);
}
4263 
4264 
4265 /*
4266  * Change notion of root (``/'') directory.
4267  */
4268 /* ARGSUSED */
/*
 * chroot(2): change the process's notion of the root directory ("/") to
 * uap->path.  Requires superuser.  On success the process's fd_rdir holds
 * a usecount on the new root vnode and FD_CHROOT is set on the fd table.
 */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may chroot. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* On success change_dir leaves an iocount on nd.ni_vp. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Swap the iocount for a long-lived usecount on the new root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount held by the previous root, if there was one. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4326 
4327 #define PATHSTATICBUFLEN 256
4328 #define PIVOT_ROOT_ENTITLEMENT              \
4329        "com.apple.private.vfs.pivot-root"
4330 
4331 #if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(2) (macOS only): atomically switch the system root filesystem
 * to the volume mounted at uap->new_rootfs_path_before, re-homing the old
 * root at uap->old_rootfs_path_after.  Restricted to launchd (pid 1) with
 * the pivot-root entitlement; the incoming root must pass kernel
 * root-authentication (no chunklisted DMGs).
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/*
	 * Copy in the incoming-root path.  Try the small stack buffer first
	 * and fall back to a MAXPATHLEN heap buffer only on ENAMETOOLONG.
	 */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same stack-then-heap strategy for the outgoing-root path. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Pick whichever buffer actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	/* Perform the actual root switch; virtual devices are prohibited. */
	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Drop the iocount from vnode_lookup, if we got that far. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4423 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is not supported on this platform. */
	return nosys(p, NULL, retval);
}
4429 #endif /* XNU_TARGET_OS_OSX */
4430 
4431 /*
4432  * Common routine for chroot and chdir.
4433  *
4434  * Returns:	0			Success
4435  *		ENOTDIR			Not a directory
4436  *		namei:???		[anything namei can return]
4437  *		vnode_authorize:???	[anything vnode_authorize can return]
4438  */
4439 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4440 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4441 {
4442 	vnode_t vp;
4443 	int error;
4444 
4445 	if ((error = namei(ndp))) {
4446 		return error;
4447 	}
4448 	nameidone(ndp);
4449 	vp = ndp->ni_vp;
4450 
4451 	if (vp->v_type != VDIR) {
4452 		vnode_put(vp);
4453 		return ENOTDIR;
4454 	}
4455 
4456 #if CONFIG_MACF
4457 	error = mac_vnode_check_chdir(ctx, vp);
4458 	if (error) {
4459 		vnode_put(vp);
4460 		return error;
4461 	}
4462 #endif
4463 
4464 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4465 	if (error) {
4466 		vnode_put(vp);
4467 		return error;
4468 	}
4469 
4470 	return error;
4471 }
4472 
4473 /*
4474  * Free the vnode data (for directories) associated with the file glob.
4475  */
4476 struct fd_vn_data *
fg_vn_data_alloc(void)4477 fg_vn_data_alloc(void)
4478 {
4479 	struct fd_vn_data *fvdata;
4480 
4481 	/* Allocate per fd vnode data */
4482 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4483 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4484 	return fvdata;
4485 }
4486 
4487 /*
4488  * Free the vnode data (for directories) associated with the file glob.
4489  */
4490 void
fg_vn_data_free(void * fgvndata)4491 fg_vn_data_free(void *fgvndata)
4492 {
4493 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4494 
4495 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4496 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4497 	kfree_type(struct fd_vn_data, fvdata);
4498 }
4499 
4500 /*
4501  * Check permissions, allocate an open file structure,
4502  * and call the device open routine if any.
4503  *
4504  * Returns:	0			Success
4505  *		EINVAL
4506  *		EINTR
4507  *	falloc:ENFILE
4508  *	falloc:EMFILE
4509  *	falloc:ENOMEM
4510  *	vn_open_auth:???
4511  *	dupfdopen:???
4512  *	VNOP_ADVLOCK:???
4513  *	vnode_setsize:???
4514  *
4515  * XXX Need to implement uid, gid
4516  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/*
	 * Raw (un)encrypted access may only be requested through the
	 * dataprotect path (via vap), never directly via open flags.
	 */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc before touching the FS. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Caller may supply an fd whose vnode authenticates this open. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd >= 0 means the lookup hit
		 * /dev/fd/N (fdesc_open): duplicate that descriptor instead.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileproc. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Acquire an advisory flock-style lock if O_EXLOCK/O_SHLOCK given. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's pages may live in the secluded pool.
	 * Writable files never qualify; read-only files qualify based on
	 * name/path heuristics that exclude binaries needed when the pool
	 * is drained (e.g. Camera launch) or for realtime audio.
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * Drop the iocount from vn_open_auth; the fileglob still holds a
	 * usecount on vp, keeping it valid for the tty check below.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: make the slot visible and drop our hold. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Unwind: unlock (if we locked), close, drop iocount, free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4818 
4819 /*
4820  * While most of the *at syscall handlers can call nameiat() which
4821  * is a wrapper around namei, the use of namei and initialisation
4822  * of nameidata are far removed and in different functions  - namei
4823  * gets called in vn_open_auth for open1. So we'll just do here what
4824  * nameiat() does.
4825  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * A dirfd only matters for relative paths; absolute paths and
	 * callers that already supplied a dvp (USEDVP) bypass it.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect "/...". */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: resolve it against dirfd's vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Hand the starting directory to namei via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4869 
4870 /*
4871  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4872  *
4873  * Parameters:	p			Process requesting the open
4874  *		uap			User argument descriptor (see below)
4875  *		retval			Pointer to an area to receive the
4876  *					return calue from the system call
4877  *
4878  * Indirect:	uap->path		Path to open (same as 'open')
4879  *		uap->flags		Flags to open (same as 'open'
4880  *		uap->uid		UID to set, if creating
4881  *		uap->gid		GID to set, if creating
4882  *		uap->mode		File mode, if creating (same as 'open')
4883  *		uap->xsecurity		ACL to set, if creating
4884  *
4885  * Returns:	0			Success
4886  *		!0			errno value
4887  *
4888  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4889  *
4890  * XXX:		We should enummerate the possible errno values here, and where
4891  *		in the code they originated.
4892  */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any (freed below). */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	/*
	 * Build the creation attributes: mode filtered through the umask
	 * (sticky bit never allowed), plus optional uid/gid/ACL.
	 */
	VATTR_INIT(&va);
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	/* va only borrowed a pointer into xsecdst; safe to free now. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4935 
4936 /*
4937  * Go through the data-protected atomically controlled open (2)
4938  *
4939  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4940  */
static int
openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
{
	/*
	 * Follow the same path as normal open(2)
	 * Look up the item if it exists, and acquire the vnode.
	 */
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;
	int error;
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;

	VATTR_INIT(&va);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
	    path, ctx);

	/*
	 * Initialize the extra fields in vnode_attr to pass down our
	 * extra fields.
	 * 1. target cprotect class.
	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
	 */
	if (flags & O_CREAT) {
		/* lower level kernel code validates that the class is valid before applying it. */
		if (class != PROTECTION_CLASS_DEFAULT) {
			/*
			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
			 * file behave the same as open (2)
			 */
			VATTR_SET(&va, va_dataprotect_class, class);
		}
	}

	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
		if (flags & (O_RDWR | O_WRONLY)) {
			/*
			 * Not allowed to write raw encrypted bytes or when opening authenticated.
			 */
			return EINVAL;
		}
		/* Translate each userland dpflag into its va_dataprotect_flags bit. */
		if (dpflags & O_DP_GETRAWENCRYPTED) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
		}
		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
		}
		if (dpflags & O_DP_AUTHENTICATE) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
		}
	}

	error = open1at(vfs_context_current(), &nd, flags, &va,
	    NULL, NULL, retval, fd, authfd);

	return error;
}
5003 
int
openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
{
	/* An authenticated open may not also create the file. */
	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
}
5014 
int
open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
{
	/* O_DP_AUTHENTICATE requires the *at variant (needs an authfd). */
	if (uap->dpflags & O_DP_AUTHENTICATE) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
}
5025 
5026 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5027 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5028     int fd, enum uio_seg segflg, int *retval)
5029 {
5030 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5031 	struct {
5032 		struct vnode_attr va;
5033 		struct nameidata nd;
5034 	} *__open_data;
5035 	struct vnode_attr *vap;
5036 	struct nameidata *ndp;
5037 	int cmode;
5038 	int error;
5039 
5040 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5041 	vap = &__open_data->va;
5042 	ndp = &__open_data->nd;
5043 
5044 	VATTR_INIT(vap);
5045 	/* Mask off all but regular access permissions */
5046 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5047 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5048 
5049 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5050 	    segflg, path, ctx);
5051 
5052 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5053 
5054 	kfree_type(typeof(*__open_data), __open_data);
5055 
5056 	return error;
5057 }
5058 
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a pthread cancellation point; check before proceeding. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5065 
5066 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5067 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5068     int32_t *retval)
5069 {
5070 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5071 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5072 }
5073 
5074 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5075 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5076     int32_t *retval)
5077 {
5078 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5079 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5080 }
5081 
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a pthread cancellation point; check before proceeding. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5088 
5089 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5090 
5091 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5092 vfs_context_can_open_by_id(vfs_context_t ctx)
5093 {
5094 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5095 		return TRUE;
5096 	}
5097 
5098 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5099 	           OPEN_BY_ID_ENTITLEMENT);
5100 }
5101 
5102 /*
5103  * openbyid_np: open a file given a file system id and a file system object id
5104  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5105  *	file systems that don't support object ids it is a node id (uint64_t).
5106  *
5107  * Parameters:	p			Process requesting the open
5108  *		uap			User argument descriptor (see below)
5109  *		retval			Pointer to an area to receive the
5110  *					return calue from the system call
5111  *
5112  * Indirect:	uap->path		Path to open (same as 'open')
5113  *
5114  *		uap->fsid		id of target file system
5115  *		uap->objid		id of target file system object
5116  *		uap->flags		Flags to open (same as 'open')
5117  *
5118  * Returns:	0			Success
5119  *		!0			errno value
5120  *
5121  *
5122  * XXX:		We should enummerate the possible errno values here, and where
5123  *		in the code they originated.
5124  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Platform binary or entitlement required. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/* Grow the buffer by MAXPATHLEN each time fsgetpath reports ENOSPC. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/* Open the resolved path; it is a kernel-space string now. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5181 
5182 
5183 /*
5184  * Create a special file.
5185  */
5186 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5187     int fd);
5188 
/*
 * Common implementation for mknod(2)/mknodat(2): create a character or
 * block special file (FIFOs are redirected to mkfifo1()).  Requires
 * superuser.  'vap' carries the mode/rdev attributes; 'fd' is the base
 * directory for relative paths (AT_FDCWD for cwd).
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Only superuser may create device nodes. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Map the S_IFMT bits to a vnode type; anything else is invalid here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5291 
5292 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5293 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5294 {
5295 	struct vnode_attr va;
5296 
5297 	VATTR_INIT(&va);
5298 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5299 	VATTR_SET(&va, va_rdev, uap->dev);
5300 
5301 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5302 }
5303 
5304 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5305 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5306 {
5307 	struct vnode_attr va;
5308 
5309 	VATTR_INIT(&va);
5310 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5311 	VATTR_SET(&va, va_rdev, uap->dev);
5312 
5313 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5314 }
5315 
5316 /*
5317  * Create a named pipe.
5318  *
5319  * Returns:	0			Success
5320  *		EEXIST
5321  *	namei:???
5322  *	vnode_authorize:???
5323  *	vn_create:???
5324  */
/*
 * Common worker for mkfifo(2)/mkfifoat(2)/mknod(2)-of-a-FIFO: look up the
 * (user-space) path relative to 'fd', verify it does not exist, and create
 * a VFIFO node with the attributes in 'vap'.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5367 
5368 
5369 /*
5370  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5371  *
5372  * Parameters:	p			Process requesting the open
5373  *		uap			User argument descriptor (see below)
5374  *		retval			(Ignored)
5375  *
5376  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5377  *		uap->uid		UID to set
5378  *		uap->gid		GID to set
5379  *		uap->mode		File mode to set (same as 'mkfifo')
5380  *		uap->xsecurity		ACL to set, if creating
5381  *
5382  * Returns:	0			Success
5383  *		!0			errno value
5384  *
5385  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5386  *
5387  * XXX:		We should enummerate the possible errno values here, and where
5388  *		in the code they originated.
5389  */
5390 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5391 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5392 {
5393 	int ciferror;
5394 	kauth_filesec_t xsecdst;
5395 	struct vnode_attr va;
5396 
5397 	AUDIT_ARG(owner, uap->uid, uap->gid);
5398 
5399 	xsecdst = KAUTH_FILESEC_NONE;
5400 	if (uap->xsecurity != USER_ADDR_NULL) {
5401 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5402 			return ciferror;
5403 		}
5404 	}
5405 
5406 	VATTR_INIT(&va);
5407 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5408 	if (uap->uid != KAUTH_UID_NONE) {
5409 		VATTR_SET(&va, va_uid, uap->uid);
5410 	}
5411 	if (uap->gid != KAUTH_GID_NONE) {
5412 		VATTR_SET(&va, va_gid, uap->gid);
5413 	}
5414 	if (xsecdst != KAUTH_FILESEC_NONE) {
5415 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5416 		va.va_vaflags |= VA_FILESEC_ACL;
5417 	}
5418 
5419 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5420 
5421 	if (xsecdst != KAUTH_FILESEC_NONE) {
5422 		kauth_filesec_free(xsecdst);
5423 	}
5424 	return ciferror;
5425 }
5426 
5427 /* ARGSUSED */
5428 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5429 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5430 {
5431 	struct vnode_attr va;
5432 
5433 	VATTR_INIT(&va);
5434 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5435 
5436 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5437 }
5438 
5439 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5440 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5441 {
5442 	struct vnode_attr va;
5443 
5444 	VATTR_INIT(&va);
5445 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5446 
5447 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5448 }
5449 
5450 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5451 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5452 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5453 
/*
 * safe_getpath_new: best-effort path reconstruction for 'dvp', optionally
 * with 'leafname' appended.  Never fails outright: if the normal path
 * lookup errors out, it walks up v_parent links (or falls back to the
 * mount point, or "/") so callers always get some usable string.
 * '*truncated_path' is set when the result is incomplete.  'firmlink'
 * selects vn_getpath() vs. vn_getpath_no_firmlink().  Returns the
 * resulting length as maintained by the vn_getpath* calls.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit but left no room for a leaf — report as truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Fallback: retry with successive ancestors until a path fits,
		 * punting to the mount point or "/" when no parent is available.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5521 
/*
 * safe_getpath: wrapper for safe_getpath_new() that resolves through
 * firmlinks.
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int resolve_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5527 
/*
 * safe_getpath_no_firmlink: wrapper for safe_getpath_new() that does NOT
 * resolve through firmlinks.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int resolve_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5533 
5534 /*
5535  * Make a hard file link.
5536  *
5537  * Returns:	0			Success
5538  *		EPERM
5539  *		EEXIST
5540  *		EXDEV
5541  *	namei:???
5542  *	vnode_authorize:???
5543  *	VNOP_LINK:???
5544  */
5545 /* ARGSUSED */
/*
 * Common implementation for link(2)/linkat(2): look up the source object
 * (following symlinks only if AT_SYMLINK_FOLLOW), look up the destination
 * name, authorize, call VNOP_LINK, and emit kauth/fsevent/audit
 * notifications.  Directory hardlinks are allowed only on filesystems
 * advertising MNTK_DIR_HARDLINKS, and then only for root or the owner.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * lookup the target node — the nameidata is reused for the second
	 * lookup, switched to CREATE mode with the destination path.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Only build path strings if someone (fsevents/kauth/audit) wants them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5759 
5760 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5761 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5762 {
5763 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5764 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5765 }
5766 
5767 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5768 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5769 {
5770 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5771 		return EINVAL;
5772 	}
5773 
5774 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5775 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5776 }
5777 
5778 /*
5779  * Make a symbolic link.
5780  *
5781  * We could add support for ACLs here too...
5782  */
5783 /* ARGSUSED */
/*
 * Common implementation for symlink(2)/symlinkat(2): copy in the link
 * contents ('path_data'), look up the destination name ('link') relative
 * to 'fd', authorize, and call VNOP_SYMLINK.  If the filesystem did not
 * hand back the new vnode, it is looked up again so identity/fsevent
 * bookkeeping can run.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link contents into a kernel buffer if they came from user space. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The destination name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/* check if a new vnode was created, else try to get one */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Only free the buffer we allocated ourselves (user-space case). */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5947 
5948 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5949 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5950 {
5951 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5952 	           uap->link, UIO_USERSPACE);
5953 }
5954 
5955 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5956 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5957     __unused int32_t *retval)
5958 {
5959 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5960 	           uap->path2, UIO_USERSPACE);
5961 }
5962 
5963 /*
5964  * Delete a whiteout from the filesystem.
5965  * No longer supported.
5966  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout removal is no longer supported; always fail. */
	return ENOTSUP;
}
5972 
5973 /*
5974  * Delete a name from the filesystem.
5975  */
5976 /* ARGSUSED */
/*
 * Common implementation for unlink(2)/unlinkat(2)/delete(2) and the
 * in-kernel unlink1().  Handles both the classic lookup+VNOP_REMOVE path
 * and compound-remove filesystems ("batched"), with a bounded retry loop
 * for ENOENT races on hardlink lookups.  'start_dvp', if non-NULL,
 * overrides 'fd' as the lookup base.  'unlink_flags' is a VNODE_REMOVE_*
 * mask.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch: nameidata (and fsevent state) is too big for the stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state; reset on every retry of the whole operation. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Swap files may only be removed by the kernel itself. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* Racing lookup; redrive a bounded number of times. */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp: the filesystem deferred the lookup to a compound remove. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	/* Build path strings only if fsevents or kauth listeners need them. */
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound remove asked us to continue the lookup. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6263 
/*
 * Kernel-internal entry point for removing a file system name.
 *
 * Thin wrapper around unlinkat_internal() that resolves 'path_arg'
 * relative to the current working directory (AT_FDCWD), optionally
 * beginning the lookup at 'start_dvp'.
 */
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	           unlink_flags);
}
6271 
6272 /*
6273  * Delete a name from the filesystem using Carbon semantics.
6274  */
int
delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
{
	/*
	 * Carbon semantics differ from unlink(2) only in that the removal
	 * fails (rather than succeeding) when the file is currently busy,
	 * hence VNODE_REMOVE_NODELETEBUSY.
	 */
	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
}
6281 
6282 /*
6283  * Delete a name from the filesystem using POSIX semantics.
6284  */
int
unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
{
	/* Plain POSIX unlink(2): user-space path, no special removal flags. */
	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
	           uap->path, UIO_USERSPACE, 0);
}
6291 
6292 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6293 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6294 {
6295 	int unlink_flags = 0;
6296 
6297 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6298 		return EINVAL;
6299 	}
6300 
6301 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6302 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6303 	}
6304 
6305 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6306 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6307 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6308 		}
6309 		return rmdirat_internal(vfs_context_current(), uap->fd,
6310 		           uap->path, UIO_USERSPACE, unlink_flags);
6311 	} else {
6312 		return unlinkat_internal(vfs_context_current(), uap->fd,
6313 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6314 	}
6315 }
6316 
6317 /*
6318  * Reposition read/write file offset.
6319  */
/*
 * lseek(2): reposition the read/write offset of an open file.
 *
 * Returns the new offset via *retval on success, or an errno value.
 * The whence argument selects the base of the seek (L_SET/L_INCR/L_XTND,
 * plus the SEEK_HOLE/SEEK_DATA extensions which are delegated to the
 * filesystem via VNOP_IOCTL).
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd does not refer to a vnode (e.g. a socket): not seekable. */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		/* POSIX: seeking on a pipe or FIFO fails with ESPIPE. */
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) only reads the current offset, so it gets
	 * the weaker "get offset" MAC check; everything else may change it.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the candidate absolute offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6411 
6412 
6413 /*
6414  * Check access permissions.
6415  *
6416  * Returns:	0			Success
6417  *		vnode_authorize:???
6418  */
6419 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6420 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6421 {
6422 	kauth_action_t action;
6423 	int error;
6424 
6425 	/*
6426 	 * If just the regular access bits, convert them to something
6427 	 * that vnode_authorize will understand.
6428 	 */
6429 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6430 		action = 0;
6431 		if (uflags & R_OK) {
6432 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6433 		}
6434 		if (uflags & W_OK) {
6435 			if (vnode_isdir(vp)) {
6436 				action |= KAUTH_VNODE_ADD_FILE |
6437 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6438 				/* might want delete rights here too */
6439 			} else {
6440 				action |= KAUTH_VNODE_WRITE_DATA;
6441 			}
6442 		}
6443 		if (uflags & X_OK) {
6444 			if (vnode_isdir(vp)) {
6445 				action |= KAUTH_VNODE_SEARCH;
6446 			} else {
6447 				action |= KAUTH_VNODE_EXECUTE;
6448 			}
6449 		}
6450 	} else {
6451 		/* take advantage of definition of uflags */
6452 		action = uflags >> 8;
6453 	}
6454 
6455 #if CONFIG_MACF
6456 	error = mac_vnode_check_access(ctx, vp, uflags);
6457 	if (error) {
6458 		return error;
6459 	}
6460 #endif /* MAC */
6461 
6462 	/* action == 0 means only check for existence */
6463 	if (action != 0) {
6464 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6465 	} else {
6466 		error = 0;
6467 	}
6468 
6469 	return error;
6470 }
6471 
6472 
6473 
6474 /*
6475  * access_extended: Check access permissions in bulk.
6476  *
6477  * Description:	uap->entries		Pointer to an array of accessx
6478  *                                      descriptor structs, plus one or
6479  *                                      more NULL terminated strings (see
6480  *                                      "Notes" section below).
6481  *		uap->size		Size of the area pointed to by
6482  *					uap->entries.
6483  *		uap->results		Pointer to the results array.
6484  *
6485  * Returns:	0			Success
6486  *		ENOMEM			Insufficient memory
6487  *		EINVAL			Invalid arguments
6488  *		namei:EFAULT		Bad address
6489  *		namei:ENAMETOOLONG	Filename too long
6490  *		namei:ENOENT		No such file or directory
6491  *		namei:ELOOP		Too many levels of symbolic links
6492  *		namei:EBADF		Bad file descriptor
6493  *		namei:ENOTDIR		Not a directory
6494  *		namei:???
6495  *		access1:
6496  *
6497  * Implicit returns:
6498  *		uap->results		Array contents modified
6499  *
6500  * Notes:	The uap->entries are structured as an arbitrary length array
6501  *		of accessx descriptors, followed by one or more NULL terminated
6502  *		strings
6503  *
6504  *			struct accessx_descriptor[0]
6505  *			...
6506  *			struct accessx_descriptor[n]
6507  *			char name_data[0];
6508  *
6509  *		We determine the entry count by walking the buffer containing
6510  *		the uap->entries argument descriptor.  For each descriptor we
6511  *		see, the valid values for the offset ad_name_offset will be
6512  *		in the byte range:
6513  *
6514  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6515  *						to
6516  *				[ uap->entries + uap->size - 2 ]
6517  *
6518  *		since we must have at least one string, and the string must
6519  *		be at least one character plus the NULL terminator in length.
6520  *
6521  * XXX:		Need to support the check-as uid argument
6522  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests are served from the stack to avoid an allocation. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file failures are reported in
		 * the result array; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6764 
6765 
6766 /*
6767  * Returns:	0			Success
6768  *		namei:EFAULT		Bad address
6769  *		namei:ENAMETOOLONG	Filename too long
6770  *		namei:ENOENT		No such file or directory
6771  *		namei:ELOOP		Too many levels of symbolic links
6772  *		namei:EBADF		Bad file descriptor
6773  *		namei:ENOTDIR		Not a directory
6774  *		namei:???
6775  *		access1:
6776  */
/*
 * Common implementation for access(2) and faccessat(2).
 *
 * Looks up 'path' relative to 'fd' and checks 'amode' against it via
 * access1().  Unless AT_EACCESS is set, the check is made against the
 * process' real (not effective) identity, per POSIX.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Forbid symlink traversal in every path component. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* The parent was only taken when a deletion check was requested. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* Drop the real-identity credential we took above, if any. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6858 
int
access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
{
	/*
	 * NOTE: uap->flags carries the access mode (amode) here — it is
	 * passed in faccessat_internal()'s amode position; the 'flag'
	 * argument (AT_* bits) is 0 for plain access(2).
	 */
	return faccessat_internal(vfs_context_current(), AT_FDCWD,
	           uap->path, uap->flags, 0, UIO_USERSPACE);
}
6865 
int
faccessat(__unused proc_t p, struct faccessat_args *uap,
    __unused int32_t *retval)
{
	/* Only these AT_* bits are valid for faccessat(2). */
	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	return faccessat_internal(vfs_context_current(), uap->fd,
	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
}
6877 
6878 /*
6879  * Returns:	0			Success
6880  *		EFAULT
6881  *	copyout:EFAULT
6882  *	namei:???
6883  *	vn_stat:???
6884  */
6885 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6886 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6887     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6888     enum uio_seg segflg, int fd, int flag)
6889 {
6890 	struct nameidata *ndp = NULL;
6891 	int follow;
6892 	union {
6893 		struct stat sb;
6894 		struct stat64 sb64;
6895 	} source = {};
6896 	union {
6897 		struct user64_stat user64_sb;
6898 		struct user32_stat user32_sb;
6899 		struct user64_stat64 user64_sb64;
6900 		struct user32_stat64 user32_sb64;
6901 	} dest = {};
6902 	caddr_t sbp;
6903 	int error, my_size;
6904 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6905 	size_t xsecurity_bufsize;
6906 	void * statptr;
6907 	struct fileproc *fp = NULL;
6908 	int needsrealdev = 0;
6909 
6910 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6911 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
6912 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6913 	    segflg, path, ctx);
6914 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6915 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6916 	}
6917 
6918 #if NAMEDRSRCFORK
6919 	int is_namedstream = 0;
6920 	/* stat calls are allowed for resource forks. */
6921 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6922 #endif
6923 
6924 	if (flag & AT_FDONLY) {
6925 		vnode_t fvp;
6926 
6927 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6928 		if (error) {
6929 			goto out;
6930 		}
6931 		if ((error = vnode_getwithref(fvp))) {
6932 			file_drop(fd);
6933 			goto out;
6934 		}
6935 		ndp->ni_vp = fvp;
6936 	} else {
6937 		error = nameiat(ndp, fd);
6938 		if (error) {
6939 			goto out;
6940 		}
6941 	}
6942 
6943 	statptr = (void *)&source;
6944 
6945 #if NAMEDRSRCFORK
6946 	/* Grab reference on the shadow stream file vnode to
6947 	 * force an inactive on release which will mark it
6948 	 * for recycle.
6949 	 */
6950 	if (vnode_isnamedstream(ndp->ni_vp) &&
6951 	    (ndp->ni_vp->v_parent != NULLVP) &&
6952 	    vnode_isshadow(ndp->ni_vp)) {
6953 		is_namedstream = 1;
6954 		vnode_ref(ndp->ni_vp);
6955 	}
6956 #endif
6957 
6958 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6959 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6960 		/*
6961 		 * If the caller has the file open, and is not
6962 		 * requesting extended security information, we are
6963 		 * going to let them get the basic stat information.
6964 		 */
6965 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6966 		    fp->fp_glob->fg_cred);
6967 	} else {
6968 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6969 		    isstat64, needsrealdev, ctx);
6970 	}
6971 
6972 #if NAMEDRSRCFORK
6973 	if (is_namedstream) {
6974 		vnode_rele(ndp->ni_vp);
6975 	}
6976 #endif
6977 	vnode_put(ndp->ni_vp);
6978 	nameidone(ndp);
6979 
6980 	if (fp) {
6981 		file_drop(fd);
6982 		fp = NULL;
6983 	}
6984 
6985 	if (error) {
6986 		goto out;
6987 	}
6988 	/* Zap spare fields */
6989 	if (isstat64 != 0) {
6990 		source.sb64.st_lspare = 0;
6991 		source.sb64.st_qspare[0] = 0LL;
6992 		source.sb64.st_qspare[1] = 0LL;
6993 		if (vfs_context_is64bit(ctx)) {
6994 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6995 			my_size = sizeof(dest.user64_sb64);
6996 			sbp = (caddr_t)&dest.user64_sb64;
6997 		} else {
6998 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6999 			my_size = sizeof(dest.user32_sb64);
7000 			sbp = (caddr_t)&dest.user32_sb64;
7001 		}
7002 		/*
7003 		 * Check if we raced (post lookup) against the last unlink of a file.
7004 		 */
7005 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7006 			source.sb64.st_nlink = 1;
7007 		}
7008 	} else {
7009 		source.sb.st_lspare = 0;
7010 		source.sb.st_qspare[0] = 0LL;
7011 		source.sb.st_qspare[1] = 0LL;
7012 		if (vfs_context_is64bit(ctx)) {
7013 			munge_user64_stat(&source.sb, &dest.user64_sb);
7014 			my_size = sizeof(dest.user64_sb);
7015 			sbp = (caddr_t)&dest.user64_sb;
7016 		} else {
7017 			munge_user32_stat(&source.sb, &dest.user32_sb);
7018 			my_size = sizeof(dest.user32_sb);
7019 			sbp = (caddr_t)&dest.user32_sb;
7020 		}
7021 
7022 		/*
7023 		 * Check if we raced (post lookup) against the last unlink of a file.
7024 		 */
7025 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7026 			source.sb.st_nlink = 1;
7027 		}
7028 	}
7029 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7030 		goto out;
7031 	}
7032 
7033 	/* caller wants extended security information? */
7034 	if (xsecurity != USER_ADDR_NULL) {
7035 		/* did we get any? */
7036 		if (fsec == KAUTH_FILESEC_NONE) {
7037 			if (susize(xsecurity_size, 0) != 0) {
7038 				error = EFAULT;
7039 				goto out;
7040 			}
7041 		} else {
7042 			/* find the user buffer size */
7043 			xsecurity_bufsize = fusize(xsecurity_size);
7044 
7045 			/* copy out the actual data size */
7046 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7047 				error = EFAULT;
7048 				goto out;
7049 			}
7050 
7051 			/* if the caller supplied enough room, copy out to it */
7052 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7053 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7054 			}
7055 		}
7056 	}
7057 out:
7058 	if (ndp) {
7059 		kfree_type(struct nameidata, ndp);
7060 	}
7061 	if (fsec != KAUTH_FILESEC_NONE) {
7062 		kauth_filesec_free(fsec);
7063 	}
7064 	return error;
7065 }
7066 
7067 /*
7068  * stat_extended: Get file status; with extended security (ACL).
7069  *
7070  * Parameters:    p                       (ignored)
7071  *                uap                     User argument descriptor (see below)
7072  *                retval                  (ignored)
7073  *
7074  * Indirect:      uap->path               Path of file to get status from
7075  *                uap->ub                 User buffer (holds file status info)
7076  *                uap->xsecurity          ACL to get (extended security)
7077  *                uap->xsecurity_size     Size of ACL
7078  *
7079  * Returns:        0                      Success
7080  *                !0                      errno value
7081  *
7082  */
int
stat_extended(__unused proc_t p, struct stat_extended_args *uap,
    __unused int32_t *retval)
{
	/* stat (follows links) + ACL copyout; 32-bit-inode stat struct. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
	           0);
}
7091 
7092 /*
7093  * Returns:	0			Success
7094  *	fstatat_internal:???		[see fstatat_internal() in this file]
7095  */
int
stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
{
	/* Classic stat(2): follow links, no ACL, 32-bit-inode stat struct. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
}
7102 
int
stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
{
	/* stat64(2): same as stat(2) but uses the large-inode stat64 struct. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
}
7109 
7110 /*
7111  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7112  *
7113  * Parameters:    p                       (ignored)
7114  *                uap                     User argument descriptor (see below)
7115  *                retval                  (ignored)
7116  *
7117  * Indirect:      uap->path               Path of file to get status from
7118  *                uap->ub                 User buffer (holds file status info)
7119  *                uap->xsecurity          ACL to get (extended security)
7120  *                uap->xsecurity_size     Size of ACL
7121  *
7122  * Returns:        0                      Success
7123  *                !0                      errno value
7124  *
7125  */
int
stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
{
	/* stat64 (follows links) + ACL copyout. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
	           0);
}
7133 
7134 /*
7135  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7136  *
7137  * Parameters:    p                       (ignored)
7138  *                uap                     User argument descriptor (see below)
7139  *                retval                  (ignored)
7140  *
7141  * Indirect:      uap->path               Path of file to get status from
7142  *                uap->ub                 User buffer (holds file status info)
7143  *                uap->xsecurity          ACL to get (extended security)
7144  *                uap->xsecurity_size     Size of ACL
7145  *
7146  * Returns:        0                      Success
7147  *                !0                      errno value
7148  *
7149  */
int
lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
{
	/* lstat (does not follow a trailing symlink) + ACL copyout. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
	           AT_SYMLINK_NOFOLLOW);
}
7157 
7158 /*
7159  * Get file status; this version does not follow links.
7160  */
int
lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
{
	/* lstat(2): like stat(2) but does not follow a trailing symlink. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
}
7167 
int
lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
{
	/* lstat64(2): no-follow variant of stat64(2). */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
}
7174 
7175 /*
7176  * lstat64_extended: Get file status; can handle large inode numbers; does not
7177  * follow links; with extended security (ACL).
7178  *
7179  * Parameters:    p                       (ignored)
7180  *                uap                     User argument descriptor (see below)
7181  *                retval                  (ignored)
7182  *
7183  * Indirect:      uap->path               Path of file to get status from
7184  *                uap->ub                 User buffer (holds file status info)
7185  *                uap->xsecurity          ACL to get (extended security)
7186  *                uap->xsecurity_size     Size of ACL
7187  *
7188  * Returns:        0                      Success
7189  *                !0                      errno value
7190  *
7191  */
int
lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
{
	/* lstat64 (no-follow, large inodes) + ACL copyout. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
	           AT_SYMLINK_NOFOLLOW);
}
7199 
int
fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
{
	/* Only these AT_* bits are valid for fstatat(2). */
	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
}
7210 
7211 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7212 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7213     __unused int32_t *retval)
7214 {
7215 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7216 		return EINVAL;
7217 	}
7218 
7219 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7220 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7221 }
7222 
7223 /*
7224  * Get configurable pathname variables.
7225  *
7226  * Returns:	0			Success
7227  *	namei:???
7228  *	vn_pathconf:???
7229  *
7230  * Notes:	Global implementation  constants are intended to be
7231  *		implemented in this function directly; all other constants
7232  *		are per-FS implementation, and therefore must be handled in
7233  *		each respective FS, instead.
7234  *
7235  * XXX We implement some things globally right now that should actually be
7236  * XXX per-FS; we will need to deal with this at some point.
7237  */
7238 /* ARGSUSED */
7239 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7240 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7241 {
7242 	int error;
7243 	struct nameidata nd;
7244 	vfs_context_t ctx = vfs_context_current();
7245 
7246 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7247 	    UIO_USERSPACE, uap->path, ctx);
7248 	error = namei(&nd);
7249 	if (error) {
7250 		return error;
7251 	}
7252 
7253 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7254 
7255 	vnode_put(nd.ni_vp);
7256 	nameidone(&nd);
7257 	return error;
7258 }
7259 
7260 /*
7261  * Return target name of a symbolic link.
7262  */
7263 /* ARGSUSED */
7264 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7265 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7266     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7267     int *retval)
7268 {
7269 	vnode_t vp;
7270 	uio_t auio;
7271 	int error;
7272 	struct nameidata nd;
7273 	UIO_STACKBUF(uio_buf, 1);
7274 	bool put_vnode;
7275 
7276 	if (bufsize > INT32_MAX) {
7277 		return EINVAL;
7278 	}
7279 
7280 	if (lnk_vp) {
7281 		vp = lnk_vp;
7282 		put_vnode = false;
7283 	} else {
7284 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7285 		    seg, path, ctx);
7286 
7287 		error = nameiat(&nd, fd);
7288 		if (error) {
7289 			return error;
7290 		}
7291 		vp = nd.ni_vp;
7292 		put_vnode = true;
7293 		nameidone(&nd);
7294 	}
7295 
7296 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7297 	    &uio_buf[0], sizeof(uio_buf));
7298 	uio_addiov(auio, buf, bufsize);
7299 	if (vp->v_type != VLNK) {
7300 		error = EINVAL;
7301 	} else {
7302 #if CONFIG_MACF
7303 		error = mac_vnode_check_readlink(ctx, vp);
7304 #endif
7305 		if (error == 0) {
7306 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7307 			    ctx);
7308 		}
7309 		if (error == 0) {
7310 			error = VNOP_READLINK(vp, auio, ctx);
7311 		}
7312 	}
7313 
7314 	if (put_vnode) {
7315 		vnode_put(vp);
7316 	}
7317 
7318 	*retval = (int)(bufsize - uio_resid(auio));
7319 	return error;
7320 }
7321 
7322 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7323 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7324 {
7325 	enum uio_seg procseg;
7326 	vnode_t vp;
7327 	int error;
7328 
7329 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7330 
7331 	AUDIT_ARG(fd, uap->fd);
7332 
7333 	if ((error = file_vnode(uap->fd, &vp))) {
7334 		return error;
7335 	}
7336 	if ((error = vnode_getwithref(vp))) {
7337 		file_drop(uap->fd);
7338 		return error;
7339 	}
7340 
7341 	error = readlinkat_internal(vfs_context_current(), -1,
7342 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7343 	    uap->bufsize, procseg, retval);
7344 
7345 	vnode_put(vp);
7346 	file_drop(uap->fd);
7347 	return error;
7348 }
7349 
7350 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7351 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7352 {
7353 	enum uio_seg procseg;
7354 
7355 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7356 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7357 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7358 	           uap->count, procseg, retval);
7359 }
7360 
7361 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7362 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7363 {
7364 	enum uio_seg procseg;
7365 
7366 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7367 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7368 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7369 	           retval);
7370 }
7371 
7372 /*
7373  * Change file flags, the deep inner layer.
7374  */
/*
 * Change file flags, the deep inner layer.
 *
 * Authorizes the flags change described by 'va' against 'vp' (asking the
 * auth layer to disregard immutability so immutable flags can be cleared),
 * then applies it by calling 'setattr' with 'arg'.  MAC policies are
 * consulted first and notified on success.
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* actually apply the change through the caller-supplied hook */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* let MAC policies observe the successful flags change */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7413 
7414 /*
7415  * Change file flags.
7416  *
7417  * NOTE: this will vnode_put() `vp'
7418  */
/*
 * Change file flags.
 *
 * Builds a vnode_attr carrying 'flags' and applies it to 'vp' through
 * chflags0()/vnode_setattr().  If the filesystem did not support setting
 * va_flags the call fails with ENOTSUP.
 *
 * NOTE: this will vnode_put() `vp'
 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* vnode_setattr is passed through chflags0's generic setattr hook */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	vnode_put(vp);

	/* the filesystem must have actually honoured the flags attribute */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7437 
7438 /*
7439  * Change flags of a file given a path name.
7440  */
7441 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* we also need the parent vnode to break any lease on the directory */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* break (don't just check) any directory lease, then drop the parent */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7476 
7477 /*
7478  * Change flags of a file given a file descriptor.
7479  */
7480 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	/* translate the descriptor into a vnode; takes an fp reference */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* take an iocount so the vnode stays usable across the change */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* break any lease held on the vnode's parent directory */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7510 
7511 /*
7512  * Change security information on a filesystem object.
7513  *
7514  * Returns:	0			Success
7515  *		EPERM			Operation not permitted
7516  *		vnode_authattr:???	[anything vnode_authattr can return]
7517  *		vnode_authorize:???	[anything vnode_authorize can return]
7518  *		vnode_setattr:???	[anything vnode_setattr can return]
7519  *
7520  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7521  *		translated to EPERM before being returned.
7522  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* consult MAC policies for each attribute actually being changed */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures are reported as EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC policies of each attribute that was changed */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7590 
7591 
7592 /*
7593  * Change mode of a file given a path name.
7594  *
7595  * Returns:	0			Success
7596  *		namei:???		[anything namei can return]
7597  *		chmod_vnode:???		[anything chmod_vnode can return]
7598  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* we also need the parent vnode to break any lease on the directory */
	wantparent = WANTPARENT;
#endif

	/* both NOFOLLOW flags suppress following a terminal symlink */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* additionally refuse symlinks anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* break any directory lease, then drop the parent reference */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7631 
/*
 * Initialize a vnode_attr from chmod_extended()/fchmod_extended() arguments.
 *
 * mode == -1 means "don't change the mode"; KAUTH_UID_NONE/KAUTH_GID_NONE
 * likewise skip the uid/gid.  On success, *pxsecdst holds a copied-in
 * kauth_filesec (which the caller must free) when 'xsecurity' names a real
 * ACL; the magic value 1 (_FILESEC_REMOVE_ACL) requests ACL removal.
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* not marked active; just keep the field deterministic */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* no ACL change requested */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* copy the caller's filesec (ACL) in from user space */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7676 
7677 /*
7678  * chmod_extended: Change the mode of a file given a path name; with extended
7679  * argument list (including extended security (ACL)).
7680  *
7681  * Parameters:	p			Process requesting the open
7682  *		uap			User argument descriptor (see below)
7683  *		retval			(ignored)
7684  *
7685  * Indirect:	uap->path		Path to object (same as 'chmod')
7686  *		uap->uid		UID to set
7687  *		uap->gid		GID to set
7688  *		uap->mode		File mode to set (same as 'chmod')
7689  *		uap->xsecurity		ACL to set (or delete)
7690  *
7691  * Returns:	0			Success
7692  *		!0			errno value
7693  *
7694  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7695  *
7696  * XXX:		We should enummerate the possible errno values here, and where
7697  *		in the code they originated.
7698  */
7699 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7700 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7701 {
7702 	int error;
7703 	struct vnode_attr va;
7704 	kauth_filesec_t xsecdst = NULL;
7705 
7706 	AUDIT_ARG(owner, uap->uid, uap->gid);
7707 
7708 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7709 	    uap->gid, uap->xsecurity);
7710 
7711 	if (error) {
7712 		return error;
7713 	}
7714 
7715 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7716 	    UIO_USERSPACE);
7717 
7718 	if (xsecdst != NULL) {
7719 		kauth_filesec_free(xsecdst);
7720 	}
7721 	return error;
7722 }
7723 
7724 /*
7725  * Returns:	0			Success
7726  *		chmodat:???		[anything chmodat can return]
7727  */
7728 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7729 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7730     int flag, enum uio_seg segflg)
7731 {
7732 	struct vnode_attr va;
7733 
7734 	VATTR_INIT(&va);
7735 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7736 
7737 	return chmodat(ctx, path, &va, fd, flag, segflg);
7738 }
7739 
7740 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7741 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7742 {
7743 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7744 	           AT_FDCWD, 0, UIO_USERSPACE);
7745 }
7746 
7747 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7748 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7749 {
7750 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7751 		return EINVAL;
7752 	}
7753 
7754 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7755 	           uap->fd, uap->flag, UIO_USERSPACE);
7756 }
7757 
7758 /*
7759  * Change mode of a file given a file descriptor.
7760  */
/*
 * Change mode of a file given a file descriptor.
 *
 * Resolves 'fd' to a vnode, takes an iocount, breaks any lease on the
 * vnode's parent directory, and applies the attributes in 'vap' via
 * chmod_vnode().
 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* break any lease held on the vnode's parent directory */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7788 
7789 /*
7790  * fchmod_extended: Change mode of a file given a file descriptor; with
7791  * extended argument list (including extended security (ACL)).
7792  *
7793  * Parameters:    p                       Process requesting to change file mode
7794  *                uap                     User argument descriptor (see below)
7795  *                retval                  (ignored)
7796  *
7797  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7798  *                uap->uid                UID to set
7799  *                uap->gid                GID to set
7800  *                uap->xsecurity          ACL to set (or delete)
7801  *                uap->fd                 File descriptor of file to change mode
7802  *
7803  * Returns:        0                      Success
7804  *                !0                      errno value
7805  *
7806  */
7807 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7808 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7809 {
7810 	int error;
7811 	struct vnode_attr va;
7812 	kauth_filesec_t xsecdst = NULL;
7813 
7814 	AUDIT_ARG(owner, uap->uid, uap->gid);
7815 
7816 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7817 	    uap->gid, uap->xsecurity);
7818 
7819 	if (error) {
7820 		return error;
7821 	}
7822 
7823 	error = fchmod1(p, uap->fd, &va);
7824 
7825 	if (xsecdst != NULL) {
7826 		kauth_filesec_free(xsecdst);
7827 	}
7828 	return error;
7829 }
7830 
7831 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7832 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7833 {
7834 	struct vnode_attr va;
7835 
7836 	VATTR_INIT(&va);
7837 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7838 
7839 	return fchmod1(p, uap->fd, &va);
7840 }
7841 
/*
 * Change ownership (uid/gid) of a vnode.  VNOVAL for either id means
 * "leave unchanged".  Authorization failures are reported as EPERM rather
 * than EACCES, per chown(2) semantics.
 *
 * NOTE(review): 'ctx' is annotated __unused but is used below; the
 * annotation appears to be a leftover — confirm it is harmless on all
 * supported compilers.
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* break any lease held on the vnode's parent directory */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* let MAC policies observe the successful ownership change */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7903 
7904 /*
7905  * Set ownership given a path name.
7906  */
7907 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* both NOFOLLOW flags suppress following a terminal symlink */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* additionally refuse symlinks anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7937 
7938 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7939 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7940 {
7941 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7942 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7943 }
7944 
7945 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7946 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7947 {
7948 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7949 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7950 }
7951 
7952 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7953 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7954 {
7955 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7956 		return EINVAL;
7957 	}
7958 
7959 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7960 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7961 }
7962 
7963 /*
7964  * Set ownership given a file descriptor.
7965  */
7966 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* translate the descriptor into a vnode; takes an fp reference */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* take an iocount so the vnode stays usable across the change */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7993 
/*
 * Copy in an array of two struct timeval from 'usrtvp' and convert them to
 * timespecs: tsp[0] = access time, tsp[1] = modification time.  A NULL user
 * pointer means "use the current time" for both.  The copyin layout depends
 * on the calling process's pointer size.
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			/* 64-bit caller: user64_timeval layout */
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			/* 32-bit caller: user32_timeval layout */
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8026 
/*
 * Apply access/modification times to 'vp'.  ts[0] is the access time,
 * ts[1] the modification time.  'nullflag' indicates the caller passed a
 * NULL times pointer (i.e. "set to now"), which relaxes authorization
 * (VA_UTIMES_NULL) and preserves EACCES instead of mapping it to EPERM.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit times + permission failure => EPERM per utimes(2) */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* let MAC policies observe the successful times change */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8083 
8084 /*
8085  * Set the access and modification times of a file.
8086  */
8087 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* we also need the parent vnode to break any lease on the directory */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* drop parent (if held for lease breaking) and target references */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8136 
8137 /*
8138  * Set the access and modification times of a file.
8139  */
8140 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* fetch times first; NULL pointer means "set to now" */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	/* take an iocount so the vnode stays usable across the change */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* break any lease held on the vnode's parent directory */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8172 
8173 static int
truncate_validate_common(proc_t p,off_t length)8174 truncate_validate_common(proc_t p, off_t length)
8175 {
8176 	rlim_t fsize_limit;
8177 
8178 	if (length < 0) {
8179 		return EINVAL;
8180 	}
8181 
8182 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8183 	if ((rlim_t)length > fsize_limit) {
8184 		psignal(p, SIGXFSZ);
8185 		return EFBIG;
8186 	}
8187 
8188 	return 0;
8189 }
8190 
/*
 * Set the data size of 'vp' to 'length'.
 *
 * 'need_auth' selects whether to run vnode_authattr()/vnode_authorize();
 * ftruncate(2) passes false because write access was already authorized at
 * open time.  'cred' is only consulted by the MAC truncate check/notify.
 * Any leases on the file or its parent directory are broken first.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* let MAC policies observe the successful truncate */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8241 
8242 /*
8243  * Truncate a file given its path name.
8244  */
8245 /* ARGSUSED */
8246 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8247 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8248 {
8249 	vfs_context_t ctx = vfs_context_current();
8250 	vnode_t vp;
8251 	int error;
8252 	struct nameidata nd;
8253 
8254 	if ((error = truncate_validate_common(p, uap->length))) {
8255 		return error;
8256 	}
8257 
8258 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8259 	    UIO_USERSPACE, uap->path, ctx);
8260 
8261 	if ((error = namei(&nd))) {
8262 		return error;
8263 	}
8264 
8265 	vp = nd.ni_vp;
8266 	nameidone(&nd);
8267 
8268 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8269 	vnode_put(vp);
8270 
8271 	return error;
8272 }
8273 
8274 /*
8275  * Truncate a file given a file descriptor.
8276  */
8277 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* validate the length against RLIMIT_FSIZE before touching the fd */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* only POSIX shared memory and vnodes can be truncated */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* the descriptor must have been opened for writing */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* write access was authorized at open time; need_auth = false */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8328 
8329 
8330 /*
8331  * Sync an open file with synchronized I/O _file_ integrity completion
8332  */
8333 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync() is a cancellation point; check for a pending cancel first. */
	__pthread_testcancel(1);
	/* MNT_WAIT requests full file-integrity completion (data + metadata). */
	return fsync_common(p, uap, MNT_WAIT);
}
8340 
8341 
8342 /*
8343  * Sync an open file with synchronized I/O _file_ integrity completion
8344  *
8345  * Notes:	This is a legacy support function that does not test for
8346  *		thread cancellation points.
8347  */
8348 /* ARGSUSED */
8349 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8350 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8351 {
8352 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8353 }
8354 
8355 
8356 /*
8357  * Sync an open file with synchronized I/O _data_ integrity completion
8358  */
8359 /* ARGSUSED */
8360 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8361 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8362 {
8363 	__pthread_testcancel(1);
8364 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8365 }
8366 
8367 
8368 /*
8369  * fsync_common
8370  *
8371  * Common fsync code to support both synchronized I/O file integrity completion
8372  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8373  *
8374  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8375  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8377  * includes additional metadata unnecessary for retrieving the file data
8378  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8379  * storage.
8380  *
8381  * Parameters:	p				The process
8382  *		uap->fd				The descriptor to synchronize
8383  *		flags				The data integrity flags
8384  *
8385  * Returns:	int				Success
8386  *	fp_getfvp:EBADF				Bad file descriptor
8387  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8388  *	VNOP_FSYNC:???				unspecified
8389  *
8390  * Notes:	We use struct fsync_args because it is a short name, and all
8391  *		caller argument structures are otherwise identical.
8392  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Translate fd to fileproc + vnode (EBADF / ENOTSUP on failure). */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode for the duration of the VNOP. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* 'flags' is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary (best-effort). */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8430 
8431 /*
8432  * Duplicate files.  Source must be a file, target must be a file or
8433  * must not exist.
8434  *
8435  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8436  *     perform inheritance correctly.
8437  */
8438 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; its iocount is released at out1. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target for creation.  SAVESTART keeps a reference on
	 * the start directory (ni_startdir), which must be released below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories cannot be copied with this interface. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need read on the source, delete on an existing target, add on the dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a directory onto itself makes no sense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 *
	 * -1 is an internal sentinel: skip the VNOP and report success.
	 */
	if (fvp == tvp) {
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Translate the "nothing to do" sentinel into success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8545 
8546 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8547 
8548 /*
8549  * Helper function for doing clones. The caller is expected to provide an
8550  * iocounted source vnode and release it.
8551  */
8552 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)8553 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8554     user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8555 {
8556 	vnode_t tvp, tdvp;
8557 	struct nameidata tond;
8558 	int error;
8559 	int follow;
8560 	boolean_t free_src_acl;
8561 	boolean_t attr_cleanup;
8562 	enum vtype v_type;
8563 	kauth_action_t action;
8564 	struct componentname *cnp;
8565 	uint32_t defaulted = 0;
8566 	struct vnode_attr va;
8567 	struct vnode_attr nva;
8568 	uint32_t vnop_flags;
8569 
8570 	v_type = vnode_vtype(fvp);
8571 	switch (v_type) {
8572 	case VLNK:
8573 	/* FALLTHRU */
8574 	case VREG:
8575 		action = KAUTH_VNODE_ADD_FILE;
8576 		break;
8577 	case VDIR:
8578 		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8579 		    fvp->v_mountedhere) {
8580 			return EINVAL;
8581 		}
8582 		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8583 		break;
8584 	default:
8585 		return EINVAL;
8586 	}
8587 
8588 	AUDIT_ARG(fd2, dst_dirfd);
8589 	AUDIT_ARG(value32, flags);
8590 
8591 	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8592 	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8593 	    UIO_USERSPACE, dst, ctx);
8594 	if ((error = nameiat(&tond, dst_dirfd))) {
8595 		return error;
8596 	}
8597 	cnp = &tond.ni_cnd;
8598 	tdvp = tond.ni_dvp;
8599 	tvp = tond.ni_vp;
8600 
8601 	free_src_acl = FALSE;
8602 	attr_cleanup = FALSE;
8603 
8604 	if (tvp != NULL) {
8605 		error = EEXIST;
8606 		goto out;
8607 	}
8608 
8609 	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8610 		error = EXDEV;
8611 		goto out;
8612 	}
8613 
8614 #if CONFIG_MACF
8615 	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8616 		goto out;
8617 	}
8618 #endif
8619 	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8620 		goto out;
8621 	}
8622 
8623 	action = KAUTH_VNODE_GENERIC_READ_BITS;
8624 	if (data_read_authorised) {
8625 		action &= ~KAUTH_VNODE_READ_DATA;
8626 	}
8627 	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8628 		goto out;
8629 	}
8630 
8631 	/*
8632 	 * certain attributes may need to be changed from the source, we ask for
8633 	 * those here with the exception of source file's ACLs unless the CLONE_ACL
8634 	 * flag is specified. By default, the clone file will inherit the target
8635 	 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8636 	 * will inherit the source file's ACLs instead.
8637 	 */
8638 	VATTR_INIT(&va);
8639 	VATTR_WANTED(&va, va_uid);
8640 	VATTR_WANTED(&va, va_gid);
8641 	VATTR_WANTED(&va, va_mode);
8642 	VATTR_WANTED(&va, va_flags);
8643 	if (flags & CLONE_ACL) {
8644 		VATTR_WANTED(&va, va_acl);
8645 	}
8646 
8647 	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
8648 		goto out;
8649 	}
8650 
8651 	VATTR_INIT(&nva);
8652 	VATTR_SET(&nva, va_type, v_type);
8653 	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8654 		VATTR_SET(&nva, va_acl, va.va_acl);
8655 		free_src_acl = TRUE;
8656 	}
8657 
8658 	/* Handle ACL inheritance, initialize vap. */
8659 	if (v_type == VLNK) {
8660 		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
8661 	} else {
8662 		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
8663 		if (error) {
8664 			goto out;
8665 		}
8666 		attr_cleanup = TRUE;
8667 	}
8668 
8669 	vnop_flags = VNODE_CLONEFILE_DEFAULT;
8670 	/*
8671 	 * We've got initial values for all security parameters,
8672 	 * If we are superuser, then we can change owners to be the
8673 	 * same as the source. Both superuser and the owner have default
8674 	 * WRITE_SECURITY privileges so all other fields can be taken
8675 	 * from source as well.
8676 	 */
8677 	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8678 		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8679 			VATTR_SET(&nva, va_uid, va.va_uid);
8680 		}
8681 		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8682 			VATTR_SET(&nva, va_gid, va.va_gid);
8683 		}
8684 	} else {
8685 		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8686 	}
8687 
8688 	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8689 		VATTR_SET(&nva, va_mode, va.va_mode);
8690 	}
8691 	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
8692 		VATTR_SET(&nva, va_flags,
8693 		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8694 		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8695 	}
8696 
8697 #if CONFIG_FILE_LEASES
8698 	vnode_breakdirlease(tdvp, false, O_WRONLY);
8699 #endif
8700 
8701 	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8702 
8703 	if (!error && tvp) {
8704 		int     update_flags = 0;
8705 #if CONFIG_FSE
8706 		int fsevent;
8707 #endif /* CONFIG_FSE */
8708 
8709 		/*
8710 		 * If some of the requested attributes weren't handled by the
8711 		 * VNOP, use our fallback code.
8712 		 */
8713 		if (!VATTR_ALL_SUPPORTED(&nva)) {
8714 			(void)vnode_setattr_fallback(tvp, &nva, ctx);
8715 		}
8716 
8717 #if CONFIG_MACF
8718 		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8719 		    VNODE_LABEL_CREATE, ctx);
8720 #endif
8721 
8722 		// Make sure the name & parent pointers are hooked up
8723 		if (tvp->v_name == NULL) {
8724 			update_flags |= VNODE_UPDATE_NAME;
8725 		}
8726 		if (tvp->v_parent == NULLVP) {
8727 			update_flags |= VNODE_UPDATE_PARENT;
8728 		}
8729 
8730 		if (update_flags) {
8731 			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8732 			    cnp->cn_namelen, cnp->cn_hash, update_flags);
8733 		}
8734 
8735 #if CONFIG_FSE
8736 		switch (vnode_vtype(tvp)) {
8737 		case VLNK:
8738 		/* FALLTHRU */
8739 		case VREG:
8740 			fsevent = FSE_CREATE_FILE;
8741 			break;
8742 		case VDIR:
8743 			fsevent = FSE_CREATE_DIR;
8744 			break;
8745 		default:
8746 			goto out;
8747 		}
8748 
8749 		if (need_fsevent(fsevent, tvp)) {
8750 			/*
8751 			 * The following is a sequence of three explicit events.
8752 			 * A pair of FSE_CLONE events representing the source and destination
8753 			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8754 			 * fseventsd may coalesce the destination clone and create events
8755 			 * into a single event resulting in the following sequence for a client
8756 			 * FSE_CLONE (src)
8757 			 * FSE_CLONE | FSE_CREATE (dst)
8758 			 */
8759 			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8760 			    FSE_ARG_DONE);
8761 			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8762 			    FSE_ARG_DONE);
8763 		}
8764 #endif /* CONFIG_FSE */
8765 	}
8766 
8767 out:
8768 	if (attr_cleanup) {
8769 		vn_attribute_cleanup(&nva, defaulted);
8770 	}
8771 	if (free_src_acl && va.va_acl) {
8772 		kauth_acl_free(va.va_acl);
8773 	}
8774 	nameidone(&tond);
8775 	if (tvp) {
8776 		vnode_put(tvp);
8777 	}
8778 	vnode_put(tdvp);
8779 	return error;
8780 }
8781 
8782 /*
8783  * clone files or directories, target must not exist.
8784  */
8785 /* ARGSUSED */
8786 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8787 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8788     __unused int32_t *retval)
8789 {
8790 	vnode_t fvp;
8791 	struct nameidata fromnd;
8792 	int follow;
8793 	int error;
8794 	vfs_context_t ctx = vfs_context_current();
8795 
8796 	/* Check that the flags are valid. */
8797 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8798 		return EINVAL;
8799 	}
8800 
8801 	AUDIT_ARG(fd, uap->src_dirfd);
8802 
8803 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8804 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8805 	    UIO_USERSPACE, uap->src, ctx);
8806 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8807 		return error;
8808 	}
8809 
8810 	fvp = fromnd.ni_vp;
8811 	nameidone(&fromnd);
8812 
8813 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8814 	    uap->flags, ctx);
8815 
8816 	vnode_put(fvp);
8817 	return error;
8818 }
8819 
8820 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8821 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8822     __unused int32_t *retval)
8823 {
8824 	vnode_t fvp;
8825 	struct fileproc *fp;
8826 	int error;
8827 	vfs_context_t ctx = vfs_context_current();
8828 
8829 	/* Check that the flags are valid. */
8830 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8831 		return EINVAL;
8832 	}
8833 
8834 	AUDIT_ARG(fd, uap->src_fd);
8835 	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8836 	if (error) {
8837 		return error;
8838 	}
8839 
8840 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8841 		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8842 		error = EBADF;
8843 		goto out;
8844 	}
8845 
8846 	if ((error = vnode_getwithref(fvp))) {
8847 		goto out;
8848 	}
8849 
8850 	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8851 
8852 	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8853 	    uap->flags, ctx);
8854 
8855 	vnode_put(fvp);
8856 out:
8857 	file_drop(uap->src_fd);
8858 	return error;
8859 }
8860 
8861 static int
rename_submounts_callback(mount_t mp,void * arg)8862 rename_submounts_callback(mount_t mp, void *arg)
8863 {
8864 	int error = 0;
8865 	mount_t pmp = (mount_t)arg;
8866 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8867 
8868 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8869 		return 0;
8870 	}
8871 
8872 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8873 		return 0;
8874 	}
8875 
8876 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8877 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8878 		return -1;
8879 	}
8880 
8881 	size_t pathlen = MAXPATHLEN;
8882 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8883 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8884 	}
8885 
8886 	vfs_unbusy(mp);
8887 
8888 	return error;
8889 }
8890 
8891 /*
8892  * Rename files.  Source and destination must either both be directories,
8893  * or both not be directories.  If target is a directory, it must be empty.
8894  */
8895 /* ARGSUSED */
8896 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8897 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8898     int tofd, user_addr_t to, int segflg, u_int uflags)
8899 {
8900 	vnode_t tvp, tdvp;
8901 	vnode_t fvp, fdvp;
8902 	vnode_t mnt_fvp;
8903 	struct nameidata *fromnd, *tond;
8904 	int error = 0;
8905 	int do_retry;
8906 	int retry_count;
8907 	int mntrename;
8908 	int need_event;
8909 	int need_kpath2;
8910 	int has_listeners;
8911 	const char *oname = NULL;
8912 	char *from_name = NULL, *to_name = NULL;
8913 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8914 	int from_len = 0, to_len = 0;
8915 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8916 	int holding_mntlock;
8917 	int vn_authorize_skipped;
8918 	mount_t locked_mp = NULL;
8919 	vnode_t oparent = NULLVP;
8920 #if CONFIG_FSE
8921 	fse_info from_finfo = {}, to_finfo;
8922 #endif
8923 	int from_truncated = 0, to_truncated = 0;
8924 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8925 	int batched = 0;
8926 	struct vnode_attr *fvap, *tvap;
8927 	int continuing = 0;
8928 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8929 	int32_t nofollow_any = 0;
8930 	/* carving out a chunk for structs that are too big to be on stack. */
8931 	struct {
8932 		struct nameidata from_node, to_node;
8933 		struct vnode_attr fv_attr, tv_attr;
8934 	} * __rename_data;
8935 
8936 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8937 	fromnd = &__rename_data->from_node;
8938 	tond = &__rename_data->to_node;
8939 
8940 	holding_mntlock = 0;
8941 	do_retry = 0;
8942 	retry_count = 0;
8943 retry:
8944 	fvp = tvp = NULL;
8945 	fdvp = tdvp = NULL;
8946 	fvap = tvap = NULL;
8947 	mnt_fvp = NULLVP;
8948 	mntrename = FALSE;
8949 	vn_authorize_skipped = FALSE;
8950 
8951 	if (uflags & RENAME_NOFOLLOW_ANY) {
8952 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8953 	}
8954 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8955 	    segflg, from, ctx);
8956 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8957 
8958 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8959 	    segflg, to, ctx);
8960 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8961 
8962 continue_lookup:
8963 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8964 		if ((error = nameiat(fromnd, fromfd))) {
8965 			goto out1;
8966 		}
8967 		fdvp = fromnd->ni_dvp;
8968 		fvp  = fromnd->ni_vp;
8969 
8970 		if (fvp && fvp->v_type == VDIR) {
8971 			tond->ni_cnd.cn_flags |= WILLBEDIR;
8972 		}
8973 	}
8974 
8975 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8976 		if ((error = nameiat(tond, tofd))) {
8977 			/*
8978 			 * Translate error code for rename("dir1", "dir2/.").
8979 			 */
8980 			if (error == EISDIR && fvp->v_type == VDIR) {
8981 				error = EINVAL;
8982 			}
8983 			goto out1;
8984 		}
8985 		tdvp = tond->ni_dvp;
8986 		tvp  = tond->ni_vp;
8987 	}
8988 
8989 #if DEVELOPMENT || DEBUG
8990 	/*
8991 	 * XXX VSWAP: Check for entitlements or special flag here
8992 	 * so we can restrict access appropriately.
8993 	 */
8994 #else /* DEVELOPMENT || DEBUG */
8995 
8996 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8997 		error = EPERM;
8998 		goto out1;
8999 	}
9000 
9001 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9002 		error = EPERM;
9003 		goto out1;
9004 	}
9005 #endif /* DEVELOPMENT || DEBUG */
9006 
9007 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9008 		error = ENOENT;
9009 		goto out1;
9010 	}
9011 
9012 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9013 		int32_t pval = 0;
9014 		int err = 0;
9015 
9016 		/*
9017 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9018 		 * has the same name as target iff the following conditions are met:
9019 		 * 1. the target file system is case insensitive
9020 		 * 2. source and target directories are the same
9021 		 * 3. source and target files are the same
9022 		 * 4. name only differs in case (determined by underlying filesystem)
9023 		 */
9024 		if (fvp != tvp || fdvp != tdvp) {
9025 			error = EEXIST;
9026 			goto out1;
9027 		}
9028 
9029 		/*
9030 		 * Assume that the target file system is case sensitive if
9031 		 * _PC_CASE_SENSITIVE selector isn't supported.
9032 		 */
9033 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9034 		if (err != 0 || pval != 0) {
9035 			error = EEXIST;
9036 			goto out1;
9037 		}
9038 	}
9039 
9040 	batched = vnode_compound_rename_available(fdvp);
9041 
9042 #if CONFIG_FSE
9043 	need_event = need_fsevent(FSE_RENAME, fdvp);
9044 	if (need_event) {
9045 		if (fvp) {
9046 			get_fse_info(fvp, &from_finfo, ctx);
9047 		} else {
9048 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9049 			if (error) {
9050 				goto out1;
9051 			}
9052 
9053 			fvap = &__rename_data->fv_attr;
9054 		}
9055 
9056 		if (tvp) {
9057 			get_fse_info(tvp, &to_finfo, ctx);
9058 		} else if (batched) {
9059 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9060 			if (error) {
9061 				goto out1;
9062 			}
9063 
9064 			tvap = &__rename_data->tv_attr;
9065 		}
9066 	}
9067 #else
9068 	need_event = 0;
9069 #endif /* CONFIG_FSE */
9070 
9071 	has_listeners = kauth_authorize_fileop_has_listeners();
9072 
9073 	need_kpath2 = 0;
9074 #if CONFIG_AUDIT
9075 	if (AUDIT_RECORD_EXISTS()) {
9076 		need_kpath2 = 1;
9077 	}
9078 #endif
9079 
9080 	if (need_event || has_listeners) {
9081 		if (from_name == NULL) {
9082 			GET_PATH(from_name);
9083 		}
9084 
9085 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9086 
9087 		if (from_name_no_firmlink == NULL) {
9088 			GET_PATH(from_name_no_firmlink);
9089 		}
9090 
9091 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9092 	}
9093 
9094 	if (need_event || need_kpath2 || has_listeners) {
9095 		if (to_name == NULL) {
9096 			GET_PATH(to_name);
9097 		}
9098 
9099 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9100 
9101 		if (to_name_no_firmlink == NULL) {
9102 			GET_PATH(to_name_no_firmlink);
9103 		}
9104 
9105 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9106 		if (to_name && need_kpath2) {
9107 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9108 		}
9109 	}
9110 	if (!fvp) {
9111 		/*
9112 		 * Claim: this check will never reject a valid rename.
9113 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9114 		 * Suppose fdvp and tdvp are not on the same mount.
9115 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9116 		 *      then you can't move it to within another dir on the same mountpoint.
9117 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9118 		 *
9119 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9120 		 */
9121 		if (fdvp->v_mount != tdvp->v_mount) {
9122 			error = EXDEV;
9123 			goto out1;
9124 		}
9125 		goto skipped_lookup;
9126 	}
9127 
9128 	/*
9129 	 * If the source and destination are the same (i.e. they're
9130 	 * links to the same vnode) and the target file system is
9131 	 * case sensitive, then there is nothing to do.
9132 	 *
9133 	 * XXX Come back to this.
9134 	 */
9135 	if (fvp == tvp) {
9136 		int pathconf_val;
9137 
9138 		/*
9139 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9140 		 * then assume that this file system is case sensitive.
9141 		 */
9142 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9143 		    pathconf_val != 0) {
9144 			vn_authorize_skipped = TRUE;
9145 			goto out1;
9146 		}
9147 	}
9148 
9149 	/*
9150 	 * Allow the renaming of mount points.
9151 	 * - target must not exist
9152 	 * - target must reside in the same directory as source
9153 	 * - union mounts cannot be renamed
9154 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9155 	 *
9156 	 * XXX Handle this in VFS after a continued lookup (if we missed
9157 	 * in the cache to start off)
9158 	 *
9159 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9160 	 * we'll skip past here.  The file system is responsible for
9161 	 * checking that @tvp is not a descendent of @fvp and vice versa
9162 	 * so it should always return EINVAL if either @tvp or @fvp is the
9163 	 * root of a volume.
9164 	 */
9165 	if ((fvp->v_flag & VROOT) &&
9166 	    (fvp->v_type == VDIR) &&
9167 	    (tvp == NULL) &&
9168 	    (fvp->v_mountedhere == NULL) &&
9169 	    (fdvp == tdvp) &&
9170 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9171 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9172 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9173 		vnode_t coveredvp;
9174 
9175 		/* switch fvp to the covered vnode */
9176 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9177 		if ((vnode_getwithref(coveredvp))) {
9178 			error = ENOENT;
9179 			goto out1;
9180 		}
9181 		/*
9182 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9183 		 * later.
9184 		 */
9185 		mnt_fvp = fvp;
9186 
9187 		fvp = coveredvp;
9188 		mntrename = TRUE;
9189 	}
9190 	/*
9191 	 * Check for cross-device rename.
9192 	 */
9193 	if ((fvp->v_mount != tdvp->v_mount) ||
9194 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9195 		error = EXDEV;
9196 		goto out1;
9197 	}
9198 
9199 	/*
9200 	 * If source is the same as the destination (that is the
9201 	 * same inode number) then there is nothing to do...
9202 	 * EXCEPT if the underlying file system supports case
9203 	 * insensitivity and is case preserving.  In this case
9204 	 * the file system needs to handle the special case of
9205 	 * getting the same vnode as target (fvp) and source (tvp).
9206 	 *
9207 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9208 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9209 	 * handle the special case of getting the same vnode as target and
9210 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9211 	 * so not to cause locking problems. There is a single reference on tvp.
9212 	 *
9213 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9214 	 * that correct behaviour then is just to return success without doing
9215 	 * anything.
9216 	 *
9217 	 * XXX filesystem should take care of this itself, perhaps...
9218 	 */
9219 	if (fvp == tvp && fdvp == tdvp) {
9220 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9221 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9222 		    fromnd->ni_cnd.cn_namelen)) {
9223 			vn_authorize_skipped = TRUE;
9224 			goto out1;
9225 		}
9226 	}
9227 
9228 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9229 		/*
9230 		 * we're holding a reference and lock
9231 		 * on locked_mp, but it no longer matches
9232 		 * what we want to do... so drop our hold
9233 		 */
9234 		mount_unlock_renames(locked_mp);
9235 		mount_drop(locked_mp, 0);
9236 		holding_mntlock = 0;
9237 	}
9238 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9239 		/*
9240 		 * serialize renames that re-shape
9241 		 * the tree... if holding_mntlock is
9242 		 * set, then we're ready to go...
9243 		 * otherwise we
9244 		 * first need to drop the iocounts
9245 		 * we picked up, second take the
9246 		 * lock to serialize the access,
9247 		 * then finally start the lookup
9248 		 * process over with the lock held
9249 		 */
9250 		if (!holding_mntlock) {
9251 			/*
9252 			 * need to grab a reference on
9253 			 * the mount point before we
9254 			 * drop all the iocounts... once
9255 			 * the iocounts are gone, the mount
9256 			 * could follow
9257 			 */
9258 			locked_mp = fvp->v_mount;
9259 			mount_ref(locked_mp, 0);
9260 
9261 			/*
9262 			 * nameidone has to happen before we vnode_put(tvp)
9263 			 * since it may need to release the fs_nodelock on the tvp
9264 			 */
9265 			nameidone(tond);
9266 
9267 			if (tvp) {
9268 				vnode_put(tvp);
9269 			}
9270 			vnode_put(tdvp);
9271 
9272 			/*
9273 			 * nameidone has to happen before we vnode_put(fdvp)
9274 			 * since it may need to release the fs_nodelock on the fvp
9275 			 */
9276 			nameidone(fromnd);
9277 
9278 			vnode_put(fvp);
9279 			vnode_put(fdvp);
9280 
9281 			if (mnt_fvp != NULLVP) {
9282 				vnode_put(mnt_fvp);
9283 			}
9284 
9285 			mount_lock_renames(locked_mp);
9286 			holding_mntlock = 1;
9287 
9288 			goto retry;
9289 		}
9290 	} else {
9291 		/*
9292 		 * when we dropped the iocounts to take
9293 		 * the lock, we allowed the identity of
9294 		 * the various vnodes to change... if they did,
9295 		 * we may no longer be dealing with a rename
9296 		 * that reshapes the tree... once we're holding
9297 		 * the iocounts, the vnodes can't change type
9298 		 * so we're free to drop the lock at this point
9299 		 * and continue on
9300 		 */
9301 		if (holding_mntlock) {
9302 			mount_unlock_renames(locked_mp);
9303 			mount_drop(locked_mp, 0);
9304 			holding_mntlock = 0;
9305 		}
9306 	}
9307 
9308 	if (!batched) {
9309 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9310 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9311 		    flags, NULL);
9312 		if (error) {
9313 			if (error == ENOENT) {
9314 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9315 					/*
9316 					 * We encountered a race where after doing the namei,
9317 					 * tvp stops being valid. If so, simply re-drive the rename
9318 					 * call from the top.
9319 					 */
9320 					do_retry = 1;
9321 					retry_count += 1;
9322 				}
9323 			}
9324 			goto out1;
9325 		}
9326 	}
9327 
9328 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9329 	if (mnt_fvp != NULLVP) {
9330 		vnode_put(mnt_fvp);
9331 		mnt_fvp = NULLVP;
9332 	}
9333 
9334 	// save these off so we can later verify that fvp is the same
9335 	oname   = fvp->v_name;
9336 	oparent = fvp->v_parent;
9337 
9338 skipped_lookup:
9339 #if CONFIG_FILE_LEASES
9340 	/* Lease break needed for source's parent dir? */
9341 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9342 
9343 	/* Lease break needed for target's parent dir? */
9344 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9345 #endif
9346 
9347 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9348 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9349 	    flags, ctx);
9350 
9351 	if (holding_mntlock) {
9352 		/*
9353 		 * we can drop our serialization
9354 		 * lock now
9355 		 */
9356 		mount_unlock_renames(locked_mp);
9357 		mount_drop(locked_mp, 0);
9358 		holding_mntlock = 0;
9359 	}
9360 	if (error) {
9361 		if (error == EDATALESS) {
9362 			/*
9363 			 * If we've been here before, something has gone
9364 			 * horribly wrong and we should just get out lest
9365 			 * we spiral around the drain forever.
9366 			 */
9367 			if (flags & VFS_RENAME_DATALESS) {
9368 				error = EIO;
9369 				goto out1;
9370 			}
9371 
9372 			/*
9373 			 * The object we're renaming is dataless (or has a
9374 			 * dataless descendent) and requires materialization
9375 			 * before the rename occurs.  But we're holding the
9376 			 * mount point's rename lock, so it's not safe to
9377 			 * make the upcall.
9378 			 *
9379 			 * In this case, we release the lock (above), perform
9380 			 * the materialization, and start the whole thing over.
9381 			 */
9382 			error = vfs_materialize_reparent(fvp, tdvp);
9383 			if (error == 0) {
9384 				/*
9385 				 * The next time around we need to tell the
9386 				 * file system that the materializtaion has
9387 				 * been performed.
9388 				 */
9389 				flags |= VFS_RENAME_DATALESS;
9390 				do_retry = 1;
9391 			}
9392 			goto out1;
9393 		}
9394 		if (error == EKEEPLOOKING) {
9395 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9396 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9397 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9398 				}
9399 			}
9400 
9401 			fromnd->ni_vp = fvp;
9402 			tond->ni_vp = tvp;
9403 
9404 			goto continue_lookup;
9405 		}
9406 
9407 		/*
9408 		 * We may encounter a race in the VNOP where the destination didn't
9409 		 * exist when we did the namei, but it does by the time we go and
9410 		 * try to create the entry. In this case, we should re-drive this rename
9411 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9412 		 * but other filesystems susceptible to this race could return it, too.
9413 		 */
9414 		if (error == ERECYCLE) {
9415 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9416 				do_retry = 1;
9417 				retry_count += 1;
9418 			} else {
9419 				printf("rename retry limit due to ERECYCLE reached\n");
9420 				error = ENOENT;
9421 			}
9422 		}
9423 
9424 		/*
9425 		 * For compound VNOPs, the authorization callback may return
9426 		 * ENOENT in case of racing hardlink lookups hitting the name
9427 		 * cache, redrive the lookup.
9428 		 */
9429 		if (batched && error == ENOENT) {
9430 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9431 				do_retry = 1;
9432 				retry_count += 1;
9433 			}
9434 		}
9435 
9436 		goto out1;
9437 	}
9438 
9439 	/* call out to allow 3rd party notification of rename.
9440 	 * Ignore result of kauth_authorize_fileop call.
9441 	 */
9442 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9443 	    KAUTH_FILEOP_RENAME,
9444 	    (uintptr_t)from_name, (uintptr_t)to_name);
9445 	if (flags & VFS_RENAME_SWAP) {
9446 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9447 		    KAUTH_FILEOP_RENAME,
9448 		    (uintptr_t)to_name, (uintptr_t)from_name);
9449 	}
9450 
9451 #if CONFIG_FSE
9452 	if (from_name != NULL && to_name != NULL) {
9453 		if (from_truncated || to_truncated) {
9454 			// set it here since only the from_finfo gets reported up to user space
9455 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9456 		}
9457 
9458 		if (tvap && tvp) {
9459 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9460 		}
9461 		if (fvap) {
9462 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9463 		}
9464 
9465 		if (tvp) {
9466 			add_fsevent(FSE_RENAME, ctx,
9467 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9468 			    FSE_ARG_FINFO, &from_finfo,
9469 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9470 			    FSE_ARG_FINFO, &to_finfo,
9471 			    FSE_ARG_DONE);
9472 			if (flags & VFS_RENAME_SWAP) {
9473 				/*
9474 				 * Strictly speaking, swap is the equivalent of
9475 				 * *three* renames.  FSEvents clients should only take
9476 				 * the events as a hint, so we only bother reporting
9477 				 * two.
9478 				 */
9479 				add_fsevent(FSE_RENAME, ctx,
9480 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9481 				    FSE_ARG_FINFO, &to_finfo,
9482 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9483 				    FSE_ARG_FINFO, &from_finfo,
9484 				    FSE_ARG_DONE);
9485 			}
9486 		} else {
9487 			add_fsevent(FSE_RENAME, ctx,
9488 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9489 			    FSE_ARG_FINFO, &from_finfo,
9490 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9491 			    FSE_ARG_DONE);
9492 		}
9493 	}
9494 #endif /* CONFIG_FSE */
9495 
9496 	/*
9497 	 * update filesystem's mount point data
9498 	 */
9499 	if (mntrename) {
9500 		char *cp, *pathend, *mpname;
9501 		char * tobuf;
9502 		struct mount *mp;
9503 		int maxlen;
9504 		size_t len = 0;
9505 
9506 		mp = fvp->v_mountedhere;
9507 
9508 		if (vfs_busy(mp, LK_NOWAIT)) {
9509 			error = EBUSY;
9510 			goto out1;
9511 		}
9512 		tobuf = zalloc(ZV_NAMEI);
9513 
9514 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9515 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9516 		} else {
9517 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9518 		}
9519 		if (!error) {
9520 			/* find current mount point prefix */
9521 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9522 			for (cp = pathend; *cp != '\0'; ++cp) {
9523 				if (*cp == '/') {
9524 					pathend = cp + 1;
9525 				}
9526 			}
9527 			/* find last component of target name */
9528 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9529 				if (*cp == '/') {
9530 					mpname = cp + 1;
9531 				}
9532 			}
9533 
9534 			/* Update f_mntonname of sub mounts */
9535 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9536 
9537 			/* append name to prefix */
9538 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9539 			bzero(pathend, maxlen);
9540 
9541 			strlcpy(pathend, mpname, maxlen);
9542 		}
9543 		zfree(ZV_NAMEI, tobuf);
9544 
9545 		vfs_unbusy(mp);
9546 
9547 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9548 	}
9549 	/*
9550 	 * fix up name & parent pointers.  note that we first
9551 	 * check that fvp has the same name/parent pointers it
9552 	 * had before the rename call... this is a 'weak' check
9553 	 * at best...
9554 	 *
9555 	 * XXX oparent and oname may not be set in the compound vnop case
9556 	 */
9557 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9558 		int update_flags;
9559 
9560 		update_flags = VNODE_UPDATE_NAME;
9561 
9562 		if (fdvp != tdvp) {
9563 			update_flags |= VNODE_UPDATE_PARENT;
9564 		}
9565 
9566 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9567 	}
9568 out1:
9569 	/*
9570 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9571 	 * skipped earlier as no actual rename was performed.
9572 	 */
9573 	if (vn_authorize_skipped && error == 0) {
9574 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9575 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9576 		    flags, NULL);
9577 		if (error && error == ENOENT) {
9578 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9579 				do_retry = 1;
9580 				retry_count += 1;
9581 			}
9582 		}
9583 	}
9584 	if (to_name != NULL) {
9585 		RELEASE_PATH(to_name);
9586 		to_name = NULL;
9587 	}
9588 	if (to_name_no_firmlink != NULL) {
9589 		RELEASE_PATH(to_name_no_firmlink);
9590 		to_name_no_firmlink = NULL;
9591 	}
9592 	if (from_name != NULL) {
9593 		RELEASE_PATH(from_name);
9594 		from_name = NULL;
9595 	}
9596 	if (from_name_no_firmlink != NULL) {
9597 		RELEASE_PATH(from_name_no_firmlink);
9598 		from_name_no_firmlink = NULL;
9599 	}
9600 	if (holding_mntlock) {
9601 		mount_unlock_renames(locked_mp);
9602 		mount_drop(locked_mp, 0);
9603 		holding_mntlock = 0;
9604 	}
9605 	if (tdvp) {
9606 		/*
9607 		 * nameidone has to happen before we vnode_put(tdvp)
9608 		 * since it may need to release the fs_nodelock on the tdvp
9609 		 */
9610 		nameidone(tond);
9611 
9612 		if (tvp) {
9613 			vnode_put(tvp);
9614 		}
9615 		vnode_put(tdvp);
9616 	}
9617 	if (fdvp) {
9618 		/*
9619 		 * nameidone has to happen before we vnode_put(fdvp)
9620 		 * since it may need to release the fs_nodelock on the fdvp
9621 		 */
9622 		nameidone(fromnd);
9623 
9624 		if (fvp) {
9625 			vnode_put(fvp);
9626 		}
9627 		vnode_put(fdvp);
9628 	}
9629 	if (mnt_fvp != NULLVP) {
9630 		vnode_put(mnt_fvp);
9631 	}
9632 	/*
9633 	 * If things changed after we did the namei, then we will re-drive
9634 	 * this rename call from the top.
9635 	 */
9636 	if (do_retry) {
9637 		do_retry = 0;
9638 		goto retry;
9639 	}
9640 
9641 	kfree_type(typeof(*__rename_data), __rename_data);
9642 	return error;
9643 }
9644 
9645 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9646 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9647 {
9648 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9649 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9650 }
9651 
9652 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9653 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9654 {
9655 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9656 		return EINVAL;
9657 	}
9658 
9659 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9660 		return EINVAL;
9661 	}
9662 
9663 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9664 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9665 }
9666 
9667 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9668 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9669 {
9670 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9671 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9672 }
9673 
9674 /*
9675  * Make a directory file.
9676  *
9677  * Returns:	0			Success
9678  *		EEXIST
9679  *	namei:???
9680  *	vnode_authorize:???
9681  *	vn_create:???
9682  */
9683 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Look up the parent directory (LOCKPARENT) and advertise that we
	 * can participate in a compound mkdir VNOP if the file system
	 * supports one.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The last path component already exists: mkdir fails with EEXIST. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* "batched" == the fs will do lookup+authorize+create in one VNOP. */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the parent before re-driving a plain lookup. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: keep the EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the parent; break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound VNOP asked us to continue the interrupted lookup. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9799 
9800 /*
9801  * mkdir_extended: Create a directory; with extended security (ACL).
9802  *
9803  * Parameters:    p                       Process requesting to create the directory
9804  *                uap                     User argument descriptor (see below)
9805  *                retval                  (ignored)
9806  *
9807  * Indirect:      uap->path               Path of directory to create
9808  *                uap->mode               Access permissions to set
9809  *                uap->xsecurity          ACL to set
9810  *
9811  * Returns:        0                      Success
9812  *                !0                      Not success
9813  *
9814  */
9815 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9816 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9817 {
9818 	int ciferror;
9819 	kauth_filesec_t xsecdst;
9820 	struct vnode_attr va;
9821 
9822 	AUDIT_ARG(owner, uap->uid, uap->gid);
9823 
9824 	xsecdst = NULL;
9825 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9826 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9827 		return ciferror;
9828 	}
9829 
9830 	VATTR_INIT(&va);
9831 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9832 	if (xsecdst != NULL) {
9833 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9834 		va.va_vaflags |= VA_FILESEC_ACL;
9835 	}
9836 
9837 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9838 	    UIO_USERSPACE);
9839 	if (xsecdst != NULL) {
9840 		kauth_filesec_free(xsecdst);
9841 	}
9842 	return ciferror;
9843 }
9844 
9845 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9846 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9847 {
9848 	struct vnode_attr va;
9849 
9850 	VATTR_INIT(&va);
9851 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9852 
9853 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9854 	           UIO_USERSPACE);
9855 }
9856 
9857 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9858 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9859 {
9860 	struct vnode_attr va;
9861 
9862 	VATTR_INIT(&va);
9863 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9864 
9865 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9866 	           UIO_USERSPACE);
9867 }
9868 
/*
 * Remove the directory named by 'dirpath' (resolved relative to 'fd').
 * Backs the rmdir() system call.
 *
 * segflg       address space of 'dirpath' (user or system).
 * unlink_flags VNODE_REMOVE_* modifiers; VNODE_REMOVE_NOFOLLOW_ANY and
 *              VNODE_REMOVE_DATALESS_DIR are consumed here.
 *
 * Returns 0 on success or an errno value.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* NOFOLLOW_ANY is a namei option, not a VNOP flag; translate it. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		/* vp may be NULL if the fs defers the lookup to a compound rmdir. */
		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* On release kernels only the kernel itself may remove swap files. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/* ENOENT here indicates a lookup race; re-drive (bounded). */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* No vp yet; ask the fs which attributes to report later. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		/* Build the path string(s) only if fsevents or kauth listeners want them. */
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent; break any directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the interrupted lookup. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * NOTE(review): below, vp has already been vnode_put() and is used
		 * only as a sleep/wakeup channel address to serialize concurrent
		 * rmdirs during the AppleDouble-cleanup restart; it is never
		 * dereferenced here.  It may also be NULLVP on some paths — confirm
		 * that a NULL wait channel is acceptable to wakeup_one()/tsleep().
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10170 
10171 /*
10172  * Remove a directory file.
10173  */
10174 /* ARGSUSED */
10175 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10176 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10177 {
10178 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10179 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10180 }
10181 
/*
 * Get direntry length padded to 8 byte alignment.
 * sizeof(struct direntry) includes a MAXPATHLEN-byte d_name; subtract
 * the unused portion of the name buffer, then round up to 8.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Same idea as DIRENT64_LEN, but for the legacy struct dirent whose
 * d_name buffer is (__DARWIN_MAXNAMLEN + 1) bytes; rounds up to 4.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen). */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10193 
10194 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10195 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10196     int *numdirent, vfs_context_t ctxp)
10197 {
10198 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10199 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10200 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10201 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10202 	} else {
10203 		size_t bufsize;
10204 		void * bufptr;
10205 		uio_t auio;
10206 		struct direntry *entry64;
10207 		struct dirent *dep;
10208 		size_t bytesread;
10209 		int error;
10210 
10211 		/*
10212 		 * We're here because the underlying file system does not
10213 		 * support direnties or we mounted denying support so we must
10214 		 * fall back to dirents and convert them to direntries.
10215 		 *
10216 		 * Our kernel buffer needs to be smaller since re-packing will
10217 		 * expand each dirent.  The worse case (when the name length
10218 		 * is 3 or less) corresponds to a struct direntry size of 32
10219 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10220 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10221 		 * will prevent us from reading more than we can pack.
10222 		 *
10223 		 * Since this buffer is wired memory, we will limit the
10224 		 * buffer size to a maximum of 32K. We would really like to
10225 		 * use 32K in the MIN(), but we use magic number 87371 to
10226 		 * prevent uio_resid() * 3 / 8 from overflowing.
10227 		 */
10228 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10229 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10230 		if (bufptr == NULL) {
10231 			return ENOMEM;
10232 		}
10233 
10234 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10235 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10236 		auio->uio_offset = uio->uio_offset;
10237 
10238 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10239 
10240 		dep = (struct dirent *)bufptr;
10241 		bytesread = bufsize - uio_resid(auio);
10242 
10243 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10244 		/*
10245 		 * Convert all the entries and copy them out to user's buffer.
10246 		 */
10247 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10248 			/* First check that the dirent struct up to d_name is within the buffer */
10249 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10250 			    /* Check that the length of the entire dirent is within the buffer */
10251 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10252 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10253 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10254 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10255 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10256 				    vp->v_name ? vp->v_name : "<unknown>");
10257 				error = EIO;
10258 				break;
10259 			}
10260 
10261 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10262 
10263 			bzero(entry64, enbufsize);
10264 			/* Convert a dirent to a dirent64. */
10265 			entry64->d_ino = dep->d_ino;
10266 			entry64->d_seekoff = 0;
10267 			entry64->d_reclen = (uint16_t)enbufsize;
10268 			entry64->d_namlen = dep->d_namlen;
10269 			entry64->d_type = dep->d_type;
10270 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10271 
10272 			/* Move to next entry. */
10273 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10274 
10275 			/* Copy entry64 to user's buffer. */
10276 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10277 		}
10278 
10279 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10280 		if (error == 0) {
10281 			uio->uio_offset = auio->uio_offset;
10282 		}
10283 		uio_free(auio);
10284 		kfree_data(bufptr, bufsize);
10285 		kfree_type(struct direntry, entry64);
10286 		return error;
10287 	}
10288 }
10289 
/* Cap a single getdirentries() transfer at 128 MiB; larger requests are clamped. */
#define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10291 
10292 /*
10293  * Read a block of directory entries in a file system independent format.
10294  */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize against other offset users on this open file.  If the
	 * fd's backing vnode changed between fp_getfvp() and taking the
	 * offset lock (the union-mount path below swaps it), retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the transfer; the buffer size is caller-controlled. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the fd's current offset; remember it for the caller. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: drop down to the
	 * lower layer's directory and install it in the fd, then retry.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10408 
10409 
10410 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10411 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10412 {
10413 	off_t offset;
10414 	ssize_t bytesread;
10415 	int error, eofflag;
10416 
10417 	AUDIT_ARG(fd, uap->fd);
10418 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10419 	    &bytesread, &offset, &eofflag, 0);
10420 
10421 	if (error == 0) {
10422 		if (proc_is64bit(p)) {
10423 			user64_long_t base = (user64_long_t)offset;
10424 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10425 		} else {
10426 			user32_long_t base = (user32_long_t)offset;
10427 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10428 		}
10429 		*retval = (int)bytesread;
10430 	}
10431 	return error;
10432 }
10433 
10434 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10435 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10436 {
10437 	off_t offset;
10438 	ssize_t bytesread;
10439 	int error, eofflag;
10440 	user_size_t bufsize;
10441 
10442 	AUDIT_ARG(fd, uap->fd);
10443 
10444 	/*
10445 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10446 	 * then the kernel carves out the last 4 bytes to return extended
10447 	 * information to userspace (namely whether we reached EOF with this call).
10448 	 */
10449 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10450 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10451 	} else {
10452 		bufsize = uap->bufsize;
10453 	}
10454 
10455 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10456 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10457 
10458 	if (error == 0) {
10459 		*retval = bytesread;
10460 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10461 
10462 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10463 			getdirentries64_flags_t flags = 0;
10464 			if (eofflag) {
10465 				flags |= GETDIRENTRIES64_EOF;
10466 			}
10467 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10468 			    sizeof(flags));
10469 		}
10470 	}
10471 	return error;
10472 }
10473 
10474 
10475 /*
10476  * Set the mode mask for creation of filesystem nodes.
10477  * XXX implement xsecurity
10478  */
10479 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10480 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10481 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10482 {
10483 	AUDIT_ARG(mask, newmask);
10484 	proc_fdlock(p);
10485 	*retval = p->p_fd.fd_cmask;
10486 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10487 	proc_fdunlock(p);
10488 	return 0;
10489 }
10490 
10491 /*
10492  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10493  *
10494  * Parameters:    p                       Process requesting to set the umask
10495  *                uap                     User argument descriptor (see below)
10496  *                retval                  umask of the process (parameter p)
10497  *
10498  * Indirect:      uap->newmask            umask to set
10499  *                uap->xsecurity          ACL to set
10500  *
10501  * Returns:        0                      Success
10502  *                !0                      Not success
10503  *
10504  */
10505 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10506 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10507 {
10508 	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10509 }
10510 
10511 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10512 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10513 {
10514 	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10515 }
10516 
10517 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10518 	"com.apple.private.vfs.revoke-mounted-device"
10519 
/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 *
 * Only character and block special files may be revoked; the caller must
 * own the node or pass the superuser check, and a block device that is
 * currently mounted on cannot be revoked.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only device special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that has a filesystem mounted on it. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the device node, or else be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only issue the revoke if someone actually holds a use or alias. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10577 
10578 
10579 /*
10580  *  HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10581  *  The following system calls are designed to support features
10582  *  which are specific to the HFS & HFS Plus volume formats
10583  */
10584 
10585 
/*
 * Obtain attribute information on objects in a directory while enumerating
 * the directory.
 *
 * Reads entries from the directory open on uap->fd via VNOP_READDIRATTR,
 * returning the attributes requested in uap->alist.  On return uap->count
 * holds the number of entries produced, uap->newstate a directory-change
 * token, uap->basep the offset at which this call started reading, and
 * *retval the eof indication (0 or 1).
 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's count so it can be reset per union layer. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * With the offset lock held, verify the fileproc still references
	 * the vnode we resolved; a concurrent union-mount descent (below)
	 * can swap it, in which case drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, and retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10754 
/*
 * Exchange data between two files
 *
 * Atomically swaps the data of the two regular files named by uap->path1
 * and uap->path2 via VNOP_EXCHANGE.  Both files must reside on the same
 * volume and the caller needs read/write authorization for each.  On
 * success the vnodes' cached names and parents are swapped to keep the
 * name cache consistent, and kauth fileop listeners / fsevents watchers
 * are notified.
 */
/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only pay for building full paths and fse_info if someone
	 * (fsevents or a kauth fileop listener) will actually consume them.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The data was swapped underneath the names, so swap the
		 * cached names (and parents) to keep the name cache sane.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10910 
10911 /*
10912  * Return (in MB) the amount of freespace on the given vnode's volume.
10913  */
10914 uint32_t freespace_mb(vnode_t vp);
10915 
10916 uint32_t
freespace_mb(vnode_t vp)10917 freespace_mb(vnode_t vp)
10918 {
10919 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10920 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10921 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10922 }
10923 
10924 #if CONFIG_SEARCHFS
10925 
10926 /* ARGSUSED */
10927 
/*
 * searchfs
 *
 * Search a volume for objects matching the criteria in uap->searchblock,
 * using the filesystem's native VNOP_SEARCHFS support.  Matched entries'
 * attributes are written to the user return buffer; the opaque search
 * state in uap->state carries continuation information between calls.
 * EAGAIN from the filesystem means "more matches remain, call again".
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well  put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The referenced string must lie entirely within searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11210 
11211 #else /* CONFIG_SEARCHFS */
11212 
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs() is not supported when CONFIG_SEARCHFS is disabled. */
	return ENOTSUP;
}
11218 
11219 #endif /* CONFIG_SEARCHFS */
11220 
11221 
11222 #if CONFIG_DATALESS_FILES
11223 
11224 /*
11225  * === Namespace Resolver Up-call Mechanism ===
11226  *
11227  * When I/O is performed to a dataless file or directory (read, write,
11228  * lookup-in, etc.), the file system performs an upcall to the namespace
11229  * resolver (filecoordinationd) to materialize the object.
11230  *
11231  * We need multiple up-calls to be in flight at once, and we need these
11232  * up-calls to be interruptible, thus the following implementation:
11233  *
11234  * => The nspace_resolver_request represents the in-kernel request state.
11235  *    It contains a request ID, storage space for the errno code returned
11236  *    by filecoordinationd, and flags.
11237  *
11238  * => The request ID is simply a global monotonically incrementing 32-bit
11239  *    number.  Outstanding requests are stored in a hash table, and the
11240  *    hash function is extremely simple.
11241  *
11242  * => When an upcall is to be made to filecoordinationd, a request structure
11243  *    is allocated on the stack (it is small, and needs to live only during
11244  *    the duration of the call to resolve_nspace_item_ext()).  It is
11245  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11247  *    can be inserted into the table (and thus limiting the number of
11248  *    outstanding requests issued to filecoordinationd); waiting for an
11249  *    available slot is interruptible.
11250  *
11251  * => Once the request has been inserted into the table, the up-call is made
11252  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11253  *    immediately and filecoordinationd processes the request asynchronously.
11254  *
 * => The caller now waits for the request to complete.  This is achieved by
11256  *    sleeping on the address of the request structure and waiting for
11257  *    filecoordinationd to mark the request structure as complete.  This
11258  *    is an interruptible sleep call; if interrupted, the request structure
11259  *    is removed from the table and EINTR is returned to the caller.  If
11260  *    this occurs, an advisory up-call is made to filecoordinationd with
11261  *    the request ID to indicate that the request can be aborted or
11262  *    de-prioritized at the discretion of filecoordinationd.
11263  *
11264  * => When filecoordinationd has completed the request, it signals completion
11265  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11266  *    decorated as a namespace resolver can write to this sysctl node.  The
11267  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11268  *    The request ID is looked up in the table, and if the request is found,
11269  *    the error code is stored in the request structure and a wakeup()
11270  *    issued on the address of the request structure.  If the request is not
11271  *    found, we simply drop the completion notification, assuming that the
11272  *    caller was interrupted.
11273  *
11274  * => When the waiting thread wakes up, it extracts the error code from the
11275  *    request structure, removes the request from the table, and returns the
11276  *    error code to the calling function.  Fini!
11277  */
11278 
/*
 * In-flight materialization request.  Allocated on the requesting
 * thread's stack and linked into the request hash table for the
 * duration of the up-call to filecoordinationd.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;	/* hash bucket linkage */
	vnode_t         r_vp;			/* vnode being materialized */
	vnode_t         r_tdvp;			/* rename destination dir, or NULL */
	uint32_t        r_req_id;		/* ID echoed back by the resolver */
	int             r_resolver_error;	/* errno recorded at completion */
	int             r_flags;		/* RRF_* flags; NSPACE_REQ_LOCK protects */
};

#define RRF_COMPLETE    0x0001	/* request completed; r_resolver_error valid */
#define RRF_COMPLETING  0x0002	/* completion handler still referencing req */

/*
 * Completion tuple written by the resolver through the
 * vfs.nspace.complete sysctl (see sysctl_nspace_complete()).
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;		/* request being completed */
	int32_t  resolver_error;	/* errno from the resolver; 0 on success */
	uint64_t orig_gencount;		/* expected recursive gencount, or 0 = skip */
	uint64_t orig_syncroot;		/* expected sync-root ID, or 0 = skip */
};
11297 
/*
 * Return the next namespace request ID.  OSAddAtomic() returns the
 * value prior to the increment, so IDs start at 0 and will eventually
 * wrap; what matters is uniqueness among concurrently-outstanding
 * requests, which is bounded by NSPACE_RESOLVER_MAX_OUTSTANDING.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11305 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (backpressure counter). */
static u_int nspace_resolver_request_count;
/* true when some thread is sleeping for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Single mutex protecting the table, the counters, and request flags. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is 2^n - 1). */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11326 
11327 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11328 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11329 {
11330 	struct nspace_resolver_requesthead *bucket;
11331 	struct nspace_resolver_request *req;
11332 
11333 	bucket = NSPACE_RESOLVER_HASH(req_id);
11334 	LIST_FOREACH(req, bucket, r_hashlink) {
11335 		if (req->r_req_id == req_id) {
11336 			/*
11337 			 * If this request already has a completion
11338 			 * pending, don't return it again.
11339 			 */
11340 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11341 			    skip_completing) {
11342 				req = NULL;
11343 			}
11344 			return req;
11345 		}
11346 	}
11347 
11348 	return NULL;
11349 }
11350 
/*
 * Insert a request into the hash table, applying backpressure: when
 * NSPACE_RESOLVER_MAX_OUTSTANDING requests are already outstanding,
 * sleep (interruptibly) until a slot frees up.
 *
 * Returns 0 on success, or the msleep() error (e.g. EINTR) if the
 * wait for a free slot was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/*
		 * Request a wakeup when a slot frees up;
		 * nspace_resolver_req_remove_and_unlock() checks this
		 * flag before issuing the wakeup.
		 */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	/* IDs are expected to be unique among outstanding requests. */
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11382 
/*
 * Wait until any in-progress completion for 'req' has finished.
 *
 * If a completion is in-progress, we have to wait for the
 * completion handler to finish because it's still using 'req',
 * which is allocated on our stack a couple of frames up.
 *
 * Caller must hold NSPACE_REQ_LOCK; msleep() drops and re-acquires
 * it, so RRF_COMPLETING is re-checked after every wakeup.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11396 
/*
 * Remove a request from the hash table and drop NSPACE_REQ_LOCK.
 *
 * If another thread is waiting for a free table slot, wake it, since
 * we just released one.  Before returning we also wait out any
 * completion that is still in progress, because the completion
 * handler dereferences 'req' (which lives on the requester's stack).
 *
 * Called with NSPACE_REQ_LOCK held; returns with it released.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	/*
	 * NOTE(review): RRF_COMPLETING can be set here if the requester
	 * was interrupted while nspace_resolver_req_completed() had a
	 * completion in flight -- in which case this assert would trip;
	 * verify it is only expected to hold on non-completion paths.
	 */
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	/* May drop and re-take the lock while a completion drains. */
	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11421 
/*
 * Remove a request from the hash table; convenience wrapper that
 * takes NSPACE_REQ_LOCK itself.  See
 * nspace_resolver_req_remove_and_unlock() for the details.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11428 
11429 static void
nspace_resolver_req_cancel(uint32_t req_id)11430 nspace_resolver_req_cancel(uint32_t req_id)
11431 {
11432 	kern_return_t kr;
11433 	mach_port_t mp;
11434 
11435 	// Failures here aren't fatal -- the cancellation message
11436 	// sent to the resolver is merely advisory.
11437 
11438 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11439 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11440 		return;
11441 	}
11442 
11443 	kr = send_nspace_resolve_cancel(mp, req_id);
11444 	if (kr != KERN_SUCCESS) {
11445 		os_log_error(OS_LOG_DEFAULT,
11446 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11447 	}
11448 
11449 	ipc_port_release_send(mp);
11450 }
11451 
/*
 * Wait (interruptibly) for a request to be marked complete by the
 * resolver, then remove it from the table and return the resolver's
 * error code.
 *
 * If the sleep fails for any reason other than ERESTART, the request
 * is failed locally with EINTR (or ETIMEDOUT for other errors) and an
 * advisory cancel message is sent to filecoordinationd.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: fail the request ourselves. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11484 
11485 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11486 nspace_resolver_req_mark_complete(
11487 	struct nspace_resolver_request *req,
11488 	int resolver_error)
11489 {
11490 	req->r_resolver_error = resolver_error;
11491 	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
11492 	wakeup(req);
11493 }
11494 
/*
 * Flag a request as having a completion in progress, so that it is
 * neither returned by lookups made with skip_completing nor torn down
 * out from under the completion handler.
 *
 * Caller must hold NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11500 
/*
 * Handle a completion notification for an outstanding request,
 * delivered by the resolver via the vfs.nspace.complete sysctl.
 *
 * If the resolver supplied namespace-shape criteria (a recursive
 * gencount and/or a sync-root ID), they are re-verified here under
 * the mount rename lock before the request is allowed to succeed; a
 * mismatch fails the request with EBUSY.  Finally the request is
 * marked complete and the requester is woken.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	/* Skip requests that already have a completion in progress. */
	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 *
	 * (Dereferencing req->r_vp below, after the lock is dropped,
	 * relies on the same mechanism: the requester waits for
	 * RRF_COMPLETING to clear -- see
	 * nspace_resolver_req_wait_pending_completion() -- before the
	 * stack frame holding 'req' goes away.)
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/*
		 * NOTE(review): 'error' is known to be 0 on every path
		 * reaching here; this check looks like defensive dead
		 * code.
		 */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Fail with EBUSY if the subtree changed underneath us. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): likewise dead -- 'error' is 0 here. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11613 
11614 static struct proc *nspace_resolver_proc;
11615 
11616 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11617 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11618 {
11619 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11620 	    p == nspace_resolver_proc) ? 1 : 0;
11621 	return 0;
11622 }
11623 
11624 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11625 
/*
 * Register (or un-register) process 'p' as the dataless file
 * resolver.  Only a root process holding the dataless-resolver
 * entitlement may do this, and only one resolver may be registered
 * at a time (EBUSY otherwise).  Passing is_resolver == 0 clears the
 * decoration via nspace_resolver_exited().
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			/* Another resolver is already registered. */
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11665 
11666 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11667 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11668 {
11669 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11670 	    (p->p_vfs_iopolicy &
11671 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11672 		*is_prevented = 1;
11673 	} else {
11674 		*is_prevented = 0;
11675 	}
11676 	return 0;
11677 }
11678 
/*
 * Set whether dataless materialization is prevented for process 'p'
 * by toggling P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES in its VFS
 * iopolicy.  A process decorated as the resolver always remains
 * "prevented" (presumably so it cannot fault in the very files it is
 * responsible for materializing); attempting to enable
 * materialization for it returns EBUSY.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11693 
11694 static int
nspace_materialization_get_thread_state(int * is_prevented)11695 nspace_materialization_get_thread_state(int *is_prevented)
11696 {
11697 	uthread_t ut = current_uthread();
11698 
11699 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11700 	return 0;
11701 }
11702 
11703 static int
nspace_materialization_set_thread_state(int is_prevented)11704 nspace_materialization_set_thread_state(int is_prevented)
11705 {
11706 	uthread_t ut = current_uthread();
11707 
11708 	if (is_prevented) {
11709 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11710 	} else {
11711 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11712 	}
11713 	return 0;
11714 }
11715 
/* the vfs.nspace branch */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");

/*
 * sysctl handler for vfs.nspace.resolver: reading returns whether the
 * calling process is the registered resolver; writing a value
 * registers/un-registers the caller (permission checks happen in
 * nspace_resolver_set_proc_state()).
 */
static int
sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_resolver_get_proc_state(p, &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
	    &changed);
	if (error == 0 && changed) {
		/* Only act when the caller actually wrote a value. */
		error = nspace_resolver_set_proc_state(p, new_value);
	}
	return error;
}

/* decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11744 
/*
 * sysctl handler for vfs.nspace.prevent_materialization: reads/sets
 * the per-process "don't materialize dataless files" decoration (see
 * nspace_materialization_{get,set}_proc_state()).
 */
static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_proc_state(p, &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
	    &changed);
	if (error == 0 && changed) {
		/* Only act when the caller actually wrote a value. */
		error = nspace_materialization_set_proc_state(p, new_value);
	}
	return error;
}

/* decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11770 
/*
 * sysctl handler for vfs.nspace.thread_prevent_materialization:
 * reads/sets the per-thread "don't materialize dataless files"
 * decoration (see nspace_materialization_{get,set}_thread_state()).
 */
static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_thread_state(&old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
	    &changed);
	if (error == 0 && changed) {
		/* Only act when the caller actually wrote a value. */
		error = nspace_materialization_set_thread_state(new_value);
	}
	return error;
}

/* decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11795 
/*
 * sysctl handler for vfs.nspace.complete: the resolver reports the
 * completion of a request here.  Only the registered resolver process
 * may write (EPERM otherwise).
 *
 * The written payload is a { req_id, errno } pair of uint32_t's,
 * optionally followed by a uint64_t recursive gencount and a uint64_t
 * sync-root ID.  Read errors for the optional values are deliberately
 * ignored; they default to 0, which means "don't verify".
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		/* Only the decorated resolver may complete requests. */
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11863 
11864 #endif /* CONFIG_DATALESS_FILES */
11865 
11866 #if CONFIG_DATALESS_FILES
11867 #define __no_dataless_unused    /* nothing */
11868 #else
11869 #define __no_dataless_unused    __unused
11870 #endif
11871 
/*
 * Determine whether materializing dataless files is prevented for the
 * given vfs context.
 *
 * Returns:
 *   0           -- materialization may proceed
 *   EDEADLK     -- materialization is prevented; the caller should
 *                  fail the operation with this error
 *   EJUSTRETURN -- the caller is an entitled dataless manipulator;
 *                  the operation proceeds as if the object were not
 *                  dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11928 
/*
 * One-time initialization of the resolver request hash table
 * (no-op when dataless files are configured out).
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11938 
/*
 * Called when a process exits, or voluntarily un-registers itself as
 * the resolver.  If 'p' is the registered resolver, every outstanding
 * request is failed with ETIMEDOUT -- the resolver is gone, so none
 * of them can ever complete -- and the global resolver reference is
 * cleared.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/*
				 * NOTE(review): the wait below can drop
				 * NSPACE_REQ_LOCK (msleep), during which
				 * requesters may unlink entries from this
				 * bucket; verify that continuing the
				 * LIST_FOREACH across that window is safe.
				 */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11965 
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"

#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver.  Only the entitlement is checked here; the uid == 0
 * requirement is enforced by the caller (see
 * nspace_resolver_set_proc_state()).
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11983 
11984 /*
11985  * Return TRUE if the vfs context is associated with a process entitled
11986  * for dataless manipulation.
11987  *
11988  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11989  * complication around CONFIG_DATALESS_FILES.
11990  */
11991 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11992 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11993 {
11994 #if CONFIG_DATALESS_FILES
11995 	task_t task = vfs_context_task(ctx);
11996 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11997 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11998 #else
11999 	return false;
12000 #endif /* CONFIG_DATALESS_FILES */
12001 }
12002 
#if CONFIG_DATALESS_FILES
/*
 * Emit a debug log noting that the current process attempted an
 * operation on a dataless item but is decorated as no-materialization.
 * DEVELOPMENT builds additionally log the vnode's path.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char procname[MAXCOMLEN + 1];
	const char *vntype;

	proc_selfname(&procname[0], sizeof(procname));

	/* Classify the vnode for the log message. */
	switch (vp->v_type) {
	case VREG:
		vntype = "File";
		break;
	case VDIR:
		vntype = "Dir";
		break;
	case VLNK:
		vntype = "SymLink";
		break;
	default:
		vntype = "Other";
		break;
	}

#if DEVELOPMENT
	int pathlen = MAXPATHLEN;
	char *pathbuf = get_pathbuff();

	if (pathbuf != NULL) {
		vn_getpath(vp, pathbuf, &pathlen);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    procname, proc_selfpid(),
	    op, vntype, pathbuf ? pathbuf : "<unknown-path>");

	if (pathbuf != NULL) {
		release_pathbuff(pathbuf);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    procname, proc_selfpid(),
	    op, vntype);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12046 
12047 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12048 vfs_materialize_item(
12049 	vnode_t vp __no_dataless_unused,
12050 	uint32_t op __no_dataless_unused,
12051 	int64_t offset __no_dataless_unused,
12052 	int64_t size __no_dataless_unused,
12053 	char *lookup_name __no_dataless_unused,
12054 	size_t const namelen __no_dataless_unused,
12055 	vnode_t tdvp __no_dataless_unused)
12056 {
12057 #if CONFIG_DATALESS_FILES
12058 	kern_return_t kern_ret;
12059 	mach_port_t mach_port;
12060 	char *path = NULL;
12061 	vfs_context_t context;
12062 	int path_len;
12063 	int error;
12064 	audit_token_t atoken;
12065 	enum vtype vp_vtype;
12066 
12067 	/* Swap files are special; ignore them */
12068 	if (vnode_isswap(vp)) {
12069 		return 0;
12070 	}
12071 
12072 	/*
12073 	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12074 	 * are no longer used nor supported.
12075 	 */
12076 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12077 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12078 		return ENOTSUP;
12079 	}
12080 	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12081 		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12082 		return ENOTSUP;
12083 	}
12084 
12085 	/* Normalize 'op'. */
12086 	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12087 
12088 	/*
12089 	 * To-directory is only meaningful for rename operations;
12090 	 * ignore it if someone handed one to us unexpectedly.
12091 	 */
12092 	if (op != NAMESPACE_HANDLER_RENAME_OP) {
12093 		tdvp = NULL;
12094 	}
12095 
12096 	context = vfs_context_current();
12097 
12098 	/* Remember this for later. */
12099 	vp_vtype = vnode_vtype(vp);
12100 
12101 	error = vfs_context_dataless_materialization_is_prevented(context);
12102 	if (error) {
12103 		log_materialization_prevented(vp, op);
12104 		goto out_check_errors;
12105 	}
12106 
12107 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12108 	    &mach_port);
12109 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12110 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12111 		/*
12112 		 * Treat this like being unable to access the backing store
12113 		 * server.
12114 		 */
12115 		return ETIMEDOUT;
12116 	}
12117 
12118 	int path_alloc_len = MAXPATHLEN;
12119 	do {
12120 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12121 		if (path == NULL) {
12122 			return ENOMEM;
12123 		}
12124 
12125 		path_len = path_alloc_len;
12126 		error = vn_getpath(vp, path, &path_len);
12127 		if (error == 0) {
12128 			break;
12129 		} else if (error == ENOSPC) {
12130 			kfree_data(path, path_alloc_len);
12131 			path = NULL;
12132 		} else {
12133 			goto out_release_port;
12134 		}
12135 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12136 
12137 	error = vfs_context_copy_audit_token(context, &atoken);
12138 	if (error) {
12139 		goto out_release_port;
12140 	}
12141 
12142 	struct nspace_resolver_request req = {
12143 		.r_req_id = next_nspace_req_id(),
12144 		.r_vp = vp,
12145 		.r_tdvp = tdvp,
12146 	};
12147 
12148 	error = nspace_resolver_req_add(&req);
12149 	if (error) {
12150 		goto out_release_port;
12151 	}
12152 
12153 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12154 
12155 	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12156 		char *dest_path = NULL;
12157 		int dest_path_len;
12158 
12159 		dest_path = zalloc(ZV_NAMEI);
12160 		dest_path_len = MAXPATHLEN;
12161 
12162 		error = vn_getpath(tdvp, dest_path, &dest_path_len);
12163 		if (error) {
12164 			zfree(ZV_NAMEI, dest_path);
12165 			goto out_release_port;
12166 		}
12167 
12168 		/*
12169 		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12170 		 * compatibility with existing agents in user-space
12171 		 * who get passed this value.
12172 		 */
12173 		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12174 		    req.r_req_id,
12175 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12176 		    path, dest_path, atoken);
12177 
12178 		zfree(ZV_NAMEI, dest_path);
12179 	} else if (vp_vtype == VDIR) {
12180 		char *tmpname = NULL;
12181 
12182 		/*
12183 		 * If the caller provided a lookup_name *and* a name length,
12184 		 * then we assume the lookup_name is not NUL-terminated.
12185 		 * Allocate a temporary buffer in this case to provide
12186 		 * a NUL-terminated path name to the IPC call.
12187 		 */
12188 		if (lookup_name != NULL && namelen != 0) {
12189 			if (namelen >= PATH_MAX) {
12190 				error = EINVAL;
12191 				goto out_req_remove;
12192 			}
12193 			tmpname = zalloc(ZV_NAMEI);
12194 			strlcpy(tmpname, lookup_name, namelen + 1);
12195 			lookup_name = tmpname;
12196 		} else if (lookup_name != NULL) {
12197 			/*
12198 			 * If the caller provided a lookup_name with a
12199 			 * zero name length, then we assume it's NUL-
12200 			 * terminated.  Verify it has a valid length.
12201 			 */
12202 			if (strlen(lookup_name) >= PATH_MAX) {
12203 				error = EINVAL;
12204 				goto out_req_remove;
12205 			}
12206 		}
12207 
12208 		/* (See above.) */
12209 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12210 		    req.r_req_id,
12211 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12212 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12213 
12214 		if (tmpname != NULL) {
12215 			zfree(ZV_NAMEI, tmpname);
12216 
12217 			/*
12218 			 * Poison lookup_name rather than reference
12219 			 * freed memory.
12220 			 */
12221 			lookup_name = NULL;
12222 		}
12223 	} else {
12224 		/* (See above.) */
12225 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12226 		    req.r_req_id,
12227 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12228 		    offset, size, path, atoken);
12229 	}
12230 	if (kern_ret != KERN_SUCCESS) {
12231 		/*
12232 		 * Also treat this like being unable to access the backing
12233 		 * store server.
12234 		 */
12235 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12236 		    kern_ret);
12237 		error = ETIMEDOUT;
12238 		goto out_req_remove;
12239 	}
12240 
12241 	/*
12242 	 * Give back the memory we allocated earlier while we wait; we
12243 	 * no longer need it.
12244 	 */
12245 	kfree_data(path, path_alloc_len);
12246 	path = NULL;
12247 
12248 	/*
12249 	 * Request has been submitted to the resolver. Now (interruptibly)
12250 	 * wait for completion. Upon requrn, the request will have been
12251 	 * removed from the lookup table.
12252 	 */
12253 	error = nspace_resolver_req_wait(&req);
12254 
12255 out_release_port:
12256 	if (path != NULL) {
12257 		kfree_data(path, path_alloc_len);
12258 		path = NULL;
12259 	}
12260 	ipc_port_release_send(mach_port);
12261 
12262 out_check_errors:
12263 	/*
12264 	 * The file resolver owns the logic about what error to return
12265 	 * to the caller.  We only need to handle a couple of special
12266 	 * cases here:
12267 	 */
12268 	if (error == EJUSTRETURN) {
12269 		/*
12270 		 * The requesting process is allowed to interact with
12271 		 * dataless objects.  Make a couple of sanity-checks
12272 		 * here to ensure the action makes sense.
12273 		 */
12274 		switch (op) {
12275 		case NAMESPACE_HANDLER_WRITE_OP:
12276 		case NAMESPACE_HANDLER_TRUNCATE_OP:
12277 		case NAMESPACE_HANDLER_RENAME_OP:
12278 			/*
12279 			 * This handles the case of the resolver itself
12280 			 * writing data to the file (or throwing it
12281 			 * away).
12282 			 */
12283 			error = 0;
12284 			break;
12285 		case NAMESPACE_HANDLER_READ_OP:
12286 		case NAMESPACE_HANDLER_LOOKUP_OP:
12287 			/*
12288 			 * This handles the case of the resolver needing
12289 			 * to look up inside of a dataless directory while
12290 			 * it's in the process of materializing it (for
12291 			 * example, creating files or directories).
12292 			 */
12293 			error = (vp_vtype == VDIR) ? 0 : EBADF;
12294 			break;
12295 		default:
12296 			error = EBADF;
12297 			break;
12298 		}
12299 	}
12300 
12301 	return error;
12302 
12303 out_req_remove:
12304 	nspace_resolver_req_remove(&req);
12305 	goto out_release_port;
12306 #else
12307 	return ENOTSUP;
12308 #endif /* CONFIG_DATALESS_FILES */
12309 }
12310 
12311 /*
12312  * vfs_materialize_file: Materialize a regular file.
12313  *
12314  * Inputs:
12315  * vp		The dataless file to be materialized.
12316  *
12317  * op		What kind of operation is being performed:
12318  *		-> NAMESPACE_HANDLER_READ_OP
12319  *		-> NAMESPACE_HANDLER_WRITE_OP
12320  *		-> NAMESPACE_HANDLER_LINK_CREATE
12321  *		-> NAMESPACE_HANDLER_DELETE_OP
12322  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12323  *		-> NAMESPACE_HANDLER_RENAME_OP
12324  *
12325  * offset	offset of I/O for READ or WRITE.  Ignored for
12326  *		other ops.
12327  *
12328  * size		size of I/O for READ or WRITE  Ignored for
12329  *		other ops.
12330  *
12331  * If offset or size are -1 for a READ or WRITE, then the resolver should
12332  * consider the range to be unknown.
12333  *
12334  * Upon successful return, the caller may proceed with the operation.
12335  * N.B. the file may still be "dataless" in this case.
12336  */
12337 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12338 vfs_materialize_file(
12339 	struct vnode *vp,
12340 	uint64_t op,
12341 	int64_t offset,
12342 	int64_t size)
12343 {
12344 	if (vp->v_type != VREG) {
12345 		return EFTYPE;
12346 	}
12347 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12348 	           NULL);
12349 }
12350 
12351 /*
12352  * vfs_materialize_dir:
12353  *
12354  * Inputs:
12355  * vp		The dataless directory to be materialized.
12356  *
12357  * op		What kind of operation is being performed:
12358  *		-> NAMESPACE_HANDLER_READ_OP
12359  *		-> NAMESPACE_HANDLER_WRITE_OP
12360  *		-> NAMESPACE_HANDLER_DELETE_OP
12361  *		-> NAMESPACE_HANDLER_RENAME_OP
12362  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12363  *
12364  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12365  *		other ops.  May or may not be NUL-terminated; see below.
12366  *
12367  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12368  *		terminated and namelen is the number of valid bytes in
12369  *		lookup_name. If zero, then lookup_name is assumed to be
12370  *		NUL-terminated.
12371  *
12372  * Upon successful return, the caller may proceed with the operation.
12373  * N.B. the directory may still be "dataless" in this case.
12374  */
12375 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12376 vfs_materialize_dir(
12377 	struct vnode *vp,
12378 	uint64_t op,
12379 	char *lookup_name,
12380 	size_t namelen)
12381 {
12382 	if (vp->v_type != VDIR) {
12383 		return EFTYPE;
12384 	}
12385 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12386 		return EINVAL;
12387 	}
12388 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12389 	           namelen, NULL);
12390 }
12391 
12392 /*
12393  * vfs_materialize_reparent:
12394  *
12395  * Inputs:
12396  * vp		The dataless file or directory to be materialized.
12397  *
12398  * tdvp		The new parent directory for the dataless file.
12399  *
12400  * Upon successful return, the caller may proceed with the operation.
12401  * N.B. the item may still be "dataless" in this case.
12402  */
12403 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12404 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12405 {
12406 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12407 		return EFTYPE;
12408 	}
12409 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12410 	           0, 0, NULL, 0, tdvp);
12411 }
12412 
#if 0
/*
 * build_volfs_path: construct a volfs-style path ("/.vol/<fsid>/<fileid>")
 * for the given vnode.  Currently compiled out (#if 0); retained for
 * reference only.
 *
 * On entry *len is the capacity of the path buffer; on return it is set
 * to the length of the generated string plus one.  Returns 0 on success,
 * -1 if the vnode's attributes could not be fetched (in which case a
 * placeholder path is emitted instead).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12435 
12436 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12437 fsctl_bogus_command_compat(unsigned long cmd)
12438 {
12439 	switch (cmd) {
12440 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12441 		return FSIOC_SYNC_VOLUME;
12442 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12443 		return FSIOC_ROUTEFS_SETROUTEID;
12444 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12445 		return FSIOC_SET_PACKAGE_EXTS;
12446 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12447 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12448 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12449 		return DISK_CONDITIONER_IOC_GET;
12450 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12451 		return DISK_CONDITIONER_IOC_SET;
12452 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12453 		return FSIOC_FIOSEEKHOLE;
12454 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12455 		return FSIOC_FIOSEEKDATA;
12456 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12457 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12458 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12459 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12460 	}
12461 
12462 	return cmd;
12463 }
12464 
/*
 * Setattr callback for chflags0(): performs a compare-and-swap update
 * of the BSD flags by handing the fsioc_cas_bsdflags argument directly
 * to the filesystem via VNOP_IOCTL.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12470 
/*
 * Handle FSIOC_SYNC_VOLUME: sync the volume containing vp.
 *
 * The caller's iocount on vp is dropped here (bracketed by
 * vnode_hold/vnode_drop so the vnode memory stays valid) to avoid
 * deadlocking threads that call vnode_iterate_prepare() during the
 * sync.  On return *arg_vp is set to NULL so the caller knows the
 * vnode was released on its behalf.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (for e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests `arg` (the MNT_* flags computed above),
	 * not the user-supplied FSCTL_SYNC_FULLSYNC bit in *data — verify
	 * this overlap is intentional.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12535 
#if ROUTEFS
/*
 * Handle FSIOC_ROUTEFS_SETROUTEID: copy the route path in from user
 * space and mount routefs there.  Caller must be superuser.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN] = { 0 };
	size_t pathlen = 0;
	int err;

	err = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (err != 0) {
		return err;
	}

	err = copyinstr(udata, &routepath[0], MAXPATHLEN, &pathlen);
	if (err != 0) {
		return err;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12556 
12557 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12558 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12559 {
12560 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12561 	struct vnode_attr va;
12562 	int error;
12563 
12564 	VATTR_INIT(&va);
12565 	VATTR_SET(&va, va_flags, cas->new_flags);
12566 
12567 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12568 
12569 #if CONFIG_FSE
12570 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12571 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12572 	}
12573 #endif
12574 
12575 	return error;
12576 }
12577 
12578 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12579 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12580 {
12581 	struct mount *mp = NULL;
12582 	errno_t rootauth = 0;
12583 
12584 	mp = vp->v_mount;
12585 
12586 	/*
12587 	 * query the underlying FS and see if it reports something
12588 	 * sane for this vnode. If volume is authenticated via
12589 	 * chunklist, leave that for the caller to determine.
12590 	 */
12591 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12592 
12593 	return rootauth;
12594 }
12595 
12596 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12597 	"com.apple.private.kernel.set-package-extensions"
12598 
12599 /*
12600  * Make a filesystem-specific control call:
12601  */
12602 /* ARGSUSED */
12603 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)12604 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
12605 {
12606 	int error = 0;
12607 	boolean_t is64bit;
12608 	u_int size;
12609 #define STK_PARAMS 128
12610 	char stkbuf[STK_PARAMS] = {0};
12611 	caddr_t data, memp;
12612 	vnode_t vp = *arg_vp;
12613 
12614 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
12615 		return ENOTTY;
12616 	}
12617 
12618 	cmd = fsctl_bogus_command_compat(cmd);
12619 
12620 	size = IOCPARM_LEN(cmd);
12621 	if (size > IOCPARM_MAX) {
12622 		return EINVAL;
12623 	}
12624 
12625 	is64bit = proc_is64bit(p);
12626 
12627 	memp = NULL;
12628 
12629 	if (size > sizeof(stkbuf)) {
12630 		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
12631 			return ENOMEM;
12632 		}
12633 		data = memp;
12634 	} else {
12635 		data = &stkbuf[0];
12636 	};
12637 
12638 	if (cmd & IOC_IN) {
12639 		if (size) {
12640 			error = copyin(udata, data, size);
12641 			if (error) {
12642 				if (memp) {
12643 					kfree_data(memp, size);
12644 				}
12645 				return error;
12646 			}
12647 		} else {
12648 			if (is64bit) {
12649 				*(user_addr_t *)data = udata;
12650 			} else {
12651 				*(uint32_t *)data = (uint32_t)udata;
12652 			}
12653 		};
12654 	} else if ((cmd & IOC_OUT) && size) {
12655 		/*
12656 		 * Zero the buffer so the user always
12657 		 * gets back something deterministic.
12658 		 */
12659 		bzero(data, size);
12660 	} else if (cmd & IOC_VOID) {
12661 		if (is64bit) {
12662 			*(user_addr_t *)data = udata;
12663 		} else {
12664 			*(uint32_t *)data = (uint32_t)udata;
12665 		}
12666 	}
12667 
12668 	/* Check to see if it's a generic command */
12669 	switch (cmd) {
12670 	case FSIOC_SYNC_VOLUME:
12671 		error = handle_sync_volume(vp, arg_vp, data, ctx);
12672 		break;
12673 
12674 	case FSIOC_ROUTEFS_SETROUTEID:
12675 #if ROUTEFS
12676 		error = handle_routes(udata);
12677 #endif
12678 		break;
12679 
12680 	case FSIOC_SET_PACKAGE_EXTS: {
12681 		user_addr_t ext_strings;
12682 		uint32_t    num_entries;
12683 		uint32_t    max_width;
12684 
12685 		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
12686 		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
12687 			error = EPERM;
12688 			break;
12689 		}
12690 
12691 		if ((is64bit && size != sizeof(user64_package_ext_info))
12692 		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
12693 			// either you're 64-bit and passed a 64-bit struct or
12694 			// you're 32-bit and passed a 32-bit struct.  otherwise
12695 			// it's not ok.
12696 			error = EINVAL;
12697 			break;
12698 		}
12699 
12700 		if (is64bit) {
12701 			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
12702 				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
12703 			}
12704 			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
12705 			num_entries = ((user64_package_ext_info *)data)->num_entries;
12706 			max_width   = ((user64_package_ext_info *)data)->max_width;
12707 		} else {
12708 			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
12709 			num_entries = ((user32_package_ext_info *)data)->num_entries;
12710 			max_width   = ((user32_package_ext_info *)data)->max_width;
12711 		}
12712 		error = set_package_extensions_table(ext_strings, num_entries, max_width);
12713 	}
12714 	break;
12715 
12716 	case FSIOC_SET_FSTYPENAME_OVERRIDE:
12717 	{
12718 		mount_t mp;
12719 
12720 		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12721 			break;
12722 		}
12723 		if ((mp = vp->v_mount) != NULL) {
12724 			mount_lock(mp);
12725 			if (data[0] != 0) {
12726 				for (int i = 0; i < MFSTYPENAMELEN; i++) {
12727 					if (!data[i]) {
12728 						goto continue_copy;
12729 					}
12730 				}
12731 				/*
12732 				 * Getting here means we have a user data
12733 				 * string which has no NULL termination in
12734 				 * its first MFSTYPENAMELEN bytes.  This is
12735 				 * bogus, let's avoid strlcpy-ing the read
12736 				 * data and return an error.
12737 				 */
12738 				error = EINVAL;
12739 				goto unlock;
12740 continue_copy:
12741 				vfs_setfstypename_locked(mp, data);
12742 				if (vfs_isrdonly(mp) &&
12743 				    strcmp(data, "mtmfs") == 0) {
12744 					mp->mnt_kern_flag |=
12745 					    MNTK_EXTENDED_SECURITY;
12746 					mp->mnt_kern_flag &=
12747 					    ~MNTK_AUTH_OPAQUE;
12748 				}
12749 			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12750 				const char *name =
12751 				    vfs_getfstypenameref_locked(mp, NULL);
12752 				if (strcmp(name, "mtmfs") == 0) {
12753 					mp->mnt_kern_flag &=
12754 					    ~MNTK_EXTENDED_SECURITY;
12755 				}
12756 				vfs_setfstypename_locked(mp, NULL);
12757 			}
12758 unlock:
12759 			mount_unlock(mp);
12760 		}
12761 	}
12762 	break;
12763 
12764 	case DISK_CONDITIONER_IOC_GET: {
12765 		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12766 	}
12767 	break;
12768 
12769 	case DISK_CONDITIONER_IOC_SET: {
12770 		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
12771 	}
12772 	break;
12773 
12774 	case FSIOC_CAS_BSDFLAGS:
12775 		error = handle_flags(vp, data, ctx);
12776 		break;
12777 
12778 	case FSIOC_FD_ONLY_OPEN_ONCE: {
12779 		error = 0;
12780 		if (vnode_usecount(vp) > 1) {
12781 			vnode_lock_spin(vp);
12782 			if (vp->v_lflag & VL_HASSTREAMS) {
12783 				if (vnode_isinuse_locked(vp, 1, 1)) {
12784 					error = EBUSY;
12785 				}
12786 			} else if (vnode_usecount(vp) > 1) {
12787 				error = EBUSY;
12788 			}
12789 			vnode_unlock(vp);
12790 		}
12791 	}
12792 	break;
12793 
12794 	case FSIOC_EVAL_ROOTAUTH:
12795 		error = handle_auth(vp, cmd, data, options, ctx);
12796 		break;
12797 
12798 	case FSIOC_TEST_FSE_ACCESS_GRANTED:
12799 		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
12800 		break;
12801 
12802 #if CONFIG_EXCLAVES
12803 	case FSIOC_EXCLAVE_FS_REGISTER:
12804 		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12805 			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
12806 		} else {
12807 			error = EPERM;
12808 		}
12809 		break;
12810 
12811 	case FSIOC_EXCLAVE_FS_UNREGISTER:
12812 		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12813 			error = vfs_exclave_fs_unregister(vp);
12814 		} else {
12815 			error = EPERM;
12816 		}
12817 		break;
12818 
12819 	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
12820 		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
12821 		exclave_fs_base_dir_t *dirs = NULL;
12822 		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12823 			error = EPERM;
12824 			break;
12825 		}
12826 		if (get_base_dirs->base_dirs) {
12827 			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
12828 				error = EINVAL;
12829 				break;
12830 			}
12831 			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
12832 			if (!dirs) {
12833 				error = ENOSPC;
12834 				break;
12835 			}
12836 		}
12837 		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
12838 		if (!error && dirs) {
12839 			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
12840 			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
12841 		}
12842 		if (dirs) {
12843 			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
12844 		}
12845 	}
12846 	break;
12847 #endif
12848 
12849 	default: {
12850 		/*
12851 		 * Other, known commands shouldn't be passed down here.
12852 		 * (When adding a selector to this list, it may be prudent
12853 		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
12854 		 */
12855 		switch (cmd) {
12856 		case F_PUNCHHOLE:
12857 		case F_TRIM_ACTIVE_FILE:
12858 		case F_RDADVISE:
12859 		case F_TRANSCODEKEY:
12860 		case F_GETPROTECTIONLEVEL:
12861 		case F_GETDEFAULTPROTLEVEL:
12862 		case F_MAKECOMPRESSED:
12863 		case F_SET_GREEDY_MODE:
12864 		case F_SETSTATICCONTENT:
12865 		case F_SETIOTYPE:
12866 		case F_SETBACKINGSTORE:
12867 		case F_GETPATH_MTMINFO:
12868 		case APFSIOC_REVERT_TO_SNAPSHOT:
12869 		case FSIOC_FIOSEEKHOLE:
12870 		case FSIOC_FIOSEEKDATA:
12871 		case HFS_GET_BOOT_INFO:
12872 		case HFS_SET_BOOT_INFO:
12873 		case FIOPINSWAP:
12874 		case F_CHKCLEAN:
12875 		case F_FULLFSYNC:
12876 		case F_BARRIERFSYNC:
12877 		case F_FREEZE_FS:
12878 		case F_THAW_FS:
12879 		case FSIOC_KERNEL_ROOTAUTH:
12880 		case FSIOC_GRAFT_FS:
12881 		case FSIOC_UNGRAFT_FS:
12882 		case FSIOC_AUTH_FS:
12883 			error = EINVAL;
12884 			goto outdrop;
12885 		}
12886 		/* Invoke the filesystem-specific code */
12887 		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12888 	}
12889 	} /* end switch stmt */
12890 
12891 	/*
12892 	 * if no errors, copy any data to user. Size was
12893 	 * already set and checked above.
12894 	 */
12895 	if (error == 0 && (cmd & IOC_OUT) && size) {
12896 		error = copyout(data, udata, size);
12897 	}
12898 
12899 outdrop:
12900 	if (memp) {
12901 		kfree_data(memp, size);
12902 	}
12903 
12904 	return error;
12905 }
12906 
12907 /* ARGSUSED */
12908 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12909 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12910 {
12911 	int error;
12912 	struct nameidata nd;
12913 	uint32_t nameiflags;
12914 	vnode_t vp = NULL;
12915 	vfs_context_t ctx = vfs_context_current();
12916 
12917 	AUDIT_ARG(cmd, (int)uap->cmd);
12918 	AUDIT_ARG(value32, uap->options);
12919 	/* Get the vnode for the file we are getting info on:  */
12920 	nameiflags = 0;
12921 	//
12922 	// if we come through fsctl() then the file is by definition not open.
12923 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12924 	// lest the caller mistakenly thinks the only open is their own (but in
12925 	// reality it's someone elses).
12926 	//
12927 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12928 		return EINVAL;
12929 	}
12930 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12931 		nameiflags |= FOLLOW;
12932 	}
12933 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12934 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12935 	}
12936 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12937 	    UIO_USERSPACE, uap->path, ctx);
12938 	if ((error = namei(&nd))) {
12939 		goto done;
12940 	}
12941 	vp = nd.ni_vp;
12942 	nameidone(&nd);
12943 
12944 #if CONFIG_MACF
12945 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12946 	if (error) {
12947 		goto done;
12948 	}
12949 #endif
12950 
12951 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12952 
12953 done:
12954 	if (vp) {
12955 		vnode_put(vp);
12956 	}
12957 	return error;
12958 }
12959 /* ARGSUSED */
12960 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12961 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12962 {
12963 	int error;
12964 	vnode_t vp = NULL;
12965 	vfs_context_t ctx = vfs_context_current();
12966 	int fd = -1;
12967 
12968 	AUDIT_ARG(fd, uap->fd);
12969 	AUDIT_ARG(cmd, (int)uap->cmd);
12970 	AUDIT_ARG(value32, uap->options);
12971 
12972 	/* Get the vnode for the file we are getting info on:  */
12973 	if ((error = file_vnode(uap->fd, &vp))) {
12974 		return error;
12975 	}
12976 	fd = uap->fd;
12977 	if ((error = vnode_getwithref(vp))) {
12978 		file_drop(fd);
12979 		return error;
12980 	}
12981 
12982 #if CONFIG_MACF
12983 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12984 		file_drop(fd);
12985 		vnode_put(vp);
12986 		return error;
12987 	}
12988 #endif
12989 
12990 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12991 
12992 	file_drop(fd);
12993 
12994 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12995 	if (vp) {
12996 		vnode_put(vp);
12997 	}
12998 
12999 	return error;
13000 }
13001 /* end of fsctl system call */
13002 
13003 #define FILESEC_ACCESS_ENTITLEMENT              \
13004 	"com.apple.private.vfs.filesec-access"
13005 
13006 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13007 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13008 {
13009 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13010 		/*
13011 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13012 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13013 		 */
13014 		if ((!setting && vfs_context_issuser(ctx)) ||
13015 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13016 		    FILESEC_ACCESS_ENTITLEMENT)) {
13017 			return 0;
13018 		}
13019 	}
13020 
13021 	return EPERM;
13022 }
13023 
13024 /*
13025  *  Retrieve the data of an extended attribute.
13026  */
13027 int
getxattr(proc_t p,struct getxattr_args * uap,user_ssize_t * retval)13028 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
13029 {
13030 	vnode_t vp;
13031 	struct nameidata nd;
13032 	char attrname[XATTR_MAXNAMELEN + 1];
13033 	vfs_context_t ctx = vfs_context_current();
13034 	uio_t auio = NULL;
13035 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13036 	size_t attrsize = 0;
13037 	size_t namelen;
13038 	u_int32_t nameiflags;
13039 	int error;
13040 	UIO_STACKBUF(uio_buf, 1);
13041 
13042 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13043 		return EINVAL;
13044 	}
13045 
13046 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13047 	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
13048 	if ((error = namei(&nd))) {
13049 		return error;
13050 	}
13051 	vp = nd.ni_vp;
13052 	nameidone(&nd);
13053 
13054 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13055 	if (error != 0) {
13056 		goto out;
13057 	}
13058 	if (xattr_protected(attrname) &&
13059 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13060 		goto out;
13061 	}
13062 	/*
13063 	 * the specific check for 0xffffffff is a hack to preserve
13064 	 * binaray compatibilty in K64 with applications that discovered
13065 	 * that passing in a buf pointer and a size of -1 resulted in
13066 	 * just the size of the indicated extended attribute being returned.
13067 	 * this isn't part of the documented behavior, but because of the
13068 	 * original implemtation's check for "uap->size > 0", this behavior
13069 	 * was allowed. In K32 that check turned into a signed comparison
13070 	 * even though uap->size is unsigned...  in K64, we blow by that
13071 	 * check because uap->size is unsigned and doesn't get sign smeared
13072 	 * in the munger for a 32 bit user app.  we also need to add a
13073 	 * check to limit the maximum size of the buffer being passed in...
13074 	 * unfortunately, the underlying fileystems seem to just malloc
13075 	 * the requested size even if the actual extended attribute is tiny.
13076 	 * because that malloc is for kernel wired memory, we have to put a
13077 	 * sane limit on it.
13078 	 *
13079 	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
13080 	 * U64 running on K64 will yield -1 (64 bits wide)
13081 	 * U32/U64 running on K32 will yield -1 (32 bits wide)
13082 	 */
13083 	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
13084 		goto no_uio;
13085 	}
13086 
13087 	if (uap->value) {
13088 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13089 			uap->size = XATTR_MAXSIZE;
13090 		}
13091 
13092 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13093 		    &uio_buf[0], sizeof(uio_buf));
13094 		uio_addiov(auio, uap->value, uap->size);
13095 	}
13096 no_uio:
13097 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
13098 out:
13099 	vnode_put(vp);
13100 
13101 	if (auio) {
13102 		*retval = uap->size - uio_resid(auio);
13103 	} else {
13104 		*retval = (user_ssize_t)attrsize;
13105 	}
13106 
13107 	return error;
13108 }
13109 
13110 /*
13111  * Retrieve the data of an extended attribute.
13112  */
13113 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13114 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13115 {
13116 	vnode_t vp;
13117 	char attrname[XATTR_MAXNAMELEN + 1];
13118 	vfs_context_t ctx = vfs_context_current();
13119 	uio_t auio = NULL;
13120 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13121 	size_t attrsize = 0;
13122 	size_t namelen;
13123 	int error;
13124 	UIO_STACKBUF(uio_buf, 1);
13125 
13126 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13127 		return EINVAL;
13128 	}
13129 
13130 	if ((error = file_vnode(uap->fd, &vp))) {
13131 		return error;
13132 	}
13133 	if ((error = vnode_getwithref(vp))) {
13134 		file_drop(uap->fd);
13135 		return error;
13136 	}
13137 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13138 	if (error != 0) {
13139 		goto out;
13140 	}
13141 	if (xattr_protected(attrname) &&
13142 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13143 		goto out;
13144 	}
13145 	if (uap->value && uap->size > 0) {
13146 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13147 			uap->size = XATTR_MAXSIZE;
13148 		}
13149 
13150 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13151 		    &uio_buf[0], sizeof(uio_buf));
13152 		uio_addiov(auio, uap->value, uap->size);
13153 	}
13154 
13155 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13156 out:
13157 	(void)vnode_put(vp);
13158 	file_drop(uap->fd);
13159 
13160 	if (auio) {
13161 		*retval = uap->size - uio_resid(auio);
13162 	} else {
13163 		*retval = (user_ssize_t)attrsize;
13164 	}
13165 	return error;
13166 }
13167 
/*
 * Bundled lookup state for setxattr(): the nameidata, attribute name,
 * and uio backing buffer are heap-allocated together (see setxattr())
 * to keep them off the kernel stack.
 * NOTE(review): the previous comment here ("struct for checkdirs
 * iteration") appeared to be a copy-paste leftover.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13174 
13175 /*
13176  * Set the data of an extended attribute.
13177  */
13178 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)13179 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
13180 {
13181 	vnode_t vp;
13182 	vfs_context_t ctx = vfs_context_current();
13183 	uio_t auio = NULL;
13184 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13185 	size_t namelen;
13186 	u_int32_t nameiflags;
13187 	int error;
13188 	struct setxattr_ctx *sactx;
13189 
13190 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13191 		return EINVAL;
13192 	}
13193 
13194 	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
13195 	if (sactx == NULL) {
13196 		return ENOMEM;
13197 	}
13198 
13199 	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
13200 	if (error != 0) {
13201 		if (error == EPERM) {
13202 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13203 			error = ENAMETOOLONG;
13204 		}
13205 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13206 		goto out;
13207 	}
13208 	if (xattr_protected(sactx->attrname) &&
13209 	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
13210 		goto out;
13211 	}
13212 	if (uap->size != 0 && uap->value == 0) {
13213 		error = EINVAL;
13214 		goto out;
13215 	}
13216 	if (uap->size > INT_MAX) {
13217 		error = E2BIG;
13218 		goto out;
13219 	}
13220 
13221 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13222 #if CONFIG_FILE_LEASES
13223 	nameiflags |= WANTPARENT;
13224 #endif
13225 	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
13226 	if ((error = namei(&sactx->nd))) {
13227 		goto out;
13228 	}
13229 	vp = sactx->nd.ni_vp;
13230 #if CONFIG_FILE_LEASES
13231 	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
13232 	vnode_put(sactx->nd.ni_dvp);
13233 #endif
13234 	nameidone(&sactx->nd);
13235 
13236 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13237 	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
13238 	uio_addiov(auio, uap->value, uap->size);
13239 
13240 	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
13241 #if CONFIG_FSE
13242 	if (error == 0) {
13243 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13244 		    FSE_ARG_VNODE, vp,
13245 		    FSE_ARG_DONE);
13246 	}
13247 #endif
13248 	vnode_put(vp);
13249 out:
13250 	kfree_type(struct setxattr_ctx, sactx);
13251 	*retval = 0;
13252 	return error;
13253 }
13254 
13255 /*
13256  * Set the data of an extended attribute.
13257  */
13258 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13259 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13260 {
13261 	vnode_t vp;
13262 	char attrname[XATTR_MAXNAMELEN + 1];
13263 	vfs_context_t ctx = vfs_context_current();
13264 	uio_t auio = NULL;
13265 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13266 	size_t namelen;
13267 	int error;
13268 	UIO_STACKBUF(uio_buf, 1);
13269 
13270 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13271 		return EINVAL;
13272 	}
13273 
13274 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13275 	if (error != 0) {
13276 		if (error == EPERM) {
13277 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13278 			return ENAMETOOLONG;
13279 		}
13280 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13281 		return error;
13282 	}
13283 	if (xattr_protected(attrname) &&
13284 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13285 		return error;
13286 	}
13287 	if (uap->size != 0 && uap->value == 0) {
13288 		return EINVAL;
13289 	}
13290 	if (uap->size > INT_MAX) {
13291 		return E2BIG;
13292 	}
13293 	if ((error = file_vnode(uap->fd, &vp))) {
13294 		return error;
13295 	}
13296 	if ((error = vnode_getwithref(vp))) {
13297 		file_drop(uap->fd);
13298 		return error;
13299 	}
13300 
13301 #if CONFIG_FILE_LEASES
13302 	vnode_breakdirlease(vp, true, O_WRONLY);
13303 #endif
13304 
13305 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13306 	    &uio_buf[0], sizeof(uio_buf));
13307 	uio_addiov(auio, uap->value, uap->size);
13308 
13309 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13310 #if CONFIG_FSE
13311 	if (error == 0) {
13312 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13313 		    FSE_ARG_VNODE, vp,
13314 		    FSE_ARG_DONE);
13315 	}
13316 #endif
13317 	vnode_put(vp);
13318 	file_drop(uap->fd);
13319 	*retval = 0;
13320 	return error;
13321 }
13322 
13323 /*
13324  * Remove an extended attribute.
13325  * XXX Code duplication here.
13326  */
13327 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13328 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13329 {
13330 	vnode_t vp;
13331 	struct nameidata nd;
13332 	char attrname[XATTR_MAXNAMELEN + 1];
13333 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13334 	vfs_context_t ctx = vfs_context_current();
13335 	size_t namelen;
13336 	u_int32_t nameiflags;
13337 	int error;
13338 
13339 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13340 		return EINVAL;
13341 	}
13342 
13343 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13344 	if (error != 0) {
13345 		return error;
13346 	}
13347 	if (xattr_protected(attrname)) {
13348 		return EPERM;
13349 	}
13350 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13351 #if CONFIG_FILE_LEASES
13352 	nameiflags |= WANTPARENT;
13353 #endif
13354 	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13355 	if ((error = namei(&nd))) {
13356 		return error;
13357 	}
13358 	vp = nd.ni_vp;
13359 #if CONFIG_FILE_LEASES
13360 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13361 	vnode_put(nd.ni_dvp);
13362 #endif
13363 	nameidone(&nd);
13364 
13365 	error = vn_removexattr(vp, attrname, uap->options, ctx);
13366 #if CONFIG_FSE
13367 	if (error == 0) {
13368 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13369 		    FSE_ARG_VNODE, vp,
13370 		    FSE_ARG_DONE);
13371 	}
13372 #endif
13373 	vnode_put(vp);
13374 	*retval = 0;
13375 	return error;
13376 }
13377 
13378 /*
13379  * Remove an extended attribute.
13380  * XXX Code duplication here.
13381  */
13382 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13383 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13384 {
13385 	vnode_t vp;
13386 	char attrname[XATTR_MAXNAMELEN + 1];
13387 	size_t namelen;
13388 	int error;
13389 #if CONFIG_FSE
13390 	vfs_context_t ctx = vfs_context_current();
13391 #endif
13392 
13393 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13394 		return EINVAL;
13395 	}
13396 
13397 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13398 	if (error != 0) {
13399 		return error;
13400 	}
13401 	if (xattr_protected(attrname)) {
13402 		return EPERM;
13403 	}
13404 	if ((error = file_vnode(uap->fd, &vp))) {
13405 		return error;
13406 	}
13407 	if ((error = vnode_getwithref(vp))) {
13408 		file_drop(uap->fd);
13409 		return error;
13410 	}
13411 
13412 #if CONFIG_FILE_LEASES
13413 	vnode_breakdirlease(vp, true, O_WRONLY);
13414 #endif
13415 
13416 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13417 #if CONFIG_FSE
13418 	if (error == 0) {
13419 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13420 		    FSE_ARG_VNODE, vp,
13421 		    FSE_ARG_DONE);
13422 	}
13423 #endif
13424 	vnode_put(vp);
13425 	file_drop(uap->fd);
13426 	*retval = 0;
13427 	return error;
13428 }
13429 
13430 /*
13431  * Retrieve the list of extended attribute names.
13432  * XXX Code duplication here.
13433  */
13434 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13435 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13436 {
13437 	vnode_t vp;
13438 	struct nameidata nd;
13439 	vfs_context_t ctx = vfs_context_current();
13440 	uio_t auio = NULL;
13441 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13442 	size_t attrsize = 0;
13443 	u_int32_t nameiflags;
13444 	int error;
13445 	UIO_STACKBUF(uio_buf, 1);
13446 
13447 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13448 		return EINVAL;
13449 	}
13450 
13451 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13452 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13453 	if ((error = namei(&nd))) {
13454 		return error;
13455 	}
13456 	vp = nd.ni_vp;
13457 	nameidone(&nd);
13458 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13459 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13460 		    &uio_buf[0], sizeof(uio_buf));
13461 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13462 	}
13463 
13464 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13465 
13466 	vnode_put(vp);
13467 	if (auio) {
13468 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13469 	} else {
13470 		*retval = (user_ssize_t)attrsize;
13471 	}
13472 	return error;
13473 }
13474 
13475 /*
13476  * Retrieve the list of extended attribute names.
13477  * XXX Code duplication here.
13478  */
13479 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13480 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13481 {
13482 	vnode_t vp;
13483 	uio_t auio = NULL;
13484 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13485 	size_t attrsize = 0;
13486 	int error;
13487 	UIO_STACKBUF(uio_buf, 1);
13488 
13489 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13490 		return EINVAL;
13491 	}
13492 
13493 	if ((error = file_vnode(uap->fd, &vp))) {
13494 		return error;
13495 	}
13496 	if ((error = vnode_getwithref(vp))) {
13497 		file_drop(uap->fd);
13498 		return error;
13499 	}
13500 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13501 		auio = uio_createwithbuffer(1, 0, spacetype,
13502 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13503 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13504 	}
13505 
13506 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13507 
13508 	vnode_put(vp);
13509 	file_drop(uap->fd);
13510 	if (auio) {
13511 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13512 	} else {
13513 		*retval = (user_ssize_t)attrsize;
13514 	}
13515 	return error;
13516 }
13517 
13518 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13519 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13520     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13521 {
13522 	int error;
13523 	struct mount *mp = NULL;
13524 	vnode_t vp;
13525 	int length;
13526 	int bpflags;
13527 	/* maximum number of times to retry build_path */
13528 	unsigned int retries = 0x10;
13529 
13530 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13531 		return EINVAL;
13532 	}
13533 
13534 	if (buf == NULL) {
13535 		return ENOMEM;
13536 	}
13537 
13538 retry:
13539 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13540 		error = ENOTSUP;  /* unexpected failure */
13541 		return ENOTSUP;
13542 	}
13543 
13544 #if CONFIG_UNION_MOUNTS
13545 unionget:
13546 #endif /* CONFIG_UNION_MOUNTS */
13547 	if (objid == 2) {
13548 		struct vfs_attr vfsattr;
13549 		int use_vfs_root = TRUE;
13550 
13551 		VFSATTR_INIT(&vfsattr);
13552 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13553 		if (!(options & FSOPT_ISREALFSID) &&
13554 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13555 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13556 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13557 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13558 				use_vfs_root = FALSE;
13559 			}
13560 		}
13561 
13562 		if (use_vfs_root) {
13563 			error = VFS_ROOT(mp, &vp, ctx);
13564 		} else {
13565 			error = VFS_VGET(mp, objid, &vp, ctx);
13566 		}
13567 	} else {
13568 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13569 	}
13570 
13571 #if CONFIG_UNION_MOUNTS
13572 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13573 		/*
13574 		 * If the fileid isn't found and we're in a union
13575 		 * mount volume, then see if the fileid is in the
13576 		 * mounted-on volume.
13577 		 */
13578 		struct mount *tmp = mp;
13579 		mp = vnode_mount(tmp->mnt_vnodecovered);
13580 		vfs_unbusy(tmp);
13581 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13582 			goto unionget;
13583 		}
13584 	} else {
13585 		vfs_unbusy(mp);
13586 	}
13587 #else
13588 	vfs_unbusy(mp);
13589 #endif /* CONFIG_UNION_MOUNTS */
13590 
13591 	if (error) {
13592 		return error;
13593 	}
13594 
13595 #if CONFIG_MACF
13596 	error = mac_vnode_check_fsgetpath(ctx, vp);
13597 	if (error) {
13598 		vnode_put(vp);
13599 		return error;
13600 	}
13601 #endif
13602 
13603 	/* Obtain the absolute path to this vnode. */
13604 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13605 	if (options & FSOPT_NOFIRMLINKPATH) {
13606 		bpflags |= BUILDPATH_NO_FIRMLINK;
13607 	}
13608 	bpflags |= BUILDPATH_CHECK_MOVED;
13609 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13610 	vnode_put(vp);
13611 
13612 	if (error) {
13613 		/* there was a race building the path, try a few more times */
13614 		if (error == EAGAIN) {
13615 			--retries;
13616 			if (retries > 0) {
13617 				goto retry;
13618 			}
13619 
13620 			error = ENOENT;
13621 		}
13622 		goto out;
13623 	}
13624 
13625 	AUDIT_ARG(text, buf);
13626 
13627 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13628 		unsigned long path_words[NUMPARMS];
13629 		size_t path_len = sizeof(path_words);
13630 
13631 		if ((size_t)length < path_len) {
13632 			memcpy((char *)path_words, buf, length);
13633 			memset((char *)path_words + length, 0, path_len - length);
13634 
13635 			path_len = length;
13636 		} else {
13637 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13638 		}
13639 
13640 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13641 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13642 	}
13643 
13644 	*pathlen = length; /* may be superseded by error */
13645 
13646 out:
13647 	return error;
13648 }
13649 
13650 /*
13651  * Obtain the full pathname of a file system object by id.
13652  */
13653 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13654 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13655     uint32_t options, user_ssize_t *retval)
13656 {
13657 	vfs_context_t ctx = vfs_context_current();
13658 	fsid_t fsid;
13659 	char *realpath;
13660 	int length;
13661 	int error;
13662 
13663 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13664 		return EINVAL;
13665 	}
13666 
13667 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13668 		return error;
13669 	}
13670 	AUDIT_ARG(value32, fsid.val[0]);
13671 	AUDIT_ARG(value64, objid);
13672 	/* Restrict output buffer size for now. */
13673 
13674 	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13675 		return EINVAL;
13676 	}
13677 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13678 	if (realpath == NULL) {
13679 		return ENOMEM;
13680 	}
13681 
13682 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13683 	    options, &length);
13684 
13685 	if (error) {
13686 		goto out;
13687 	}
13688 
13689 	error = copyout((caddr_t)realpath, buf, length);
13690 
13691 	*retval = (user_ssize_t)length; /* may be superseded by error */
13692 out:
13693 	kfree_data(realpath, bufsize);
13694 	return error;
13695 }
13696 
13697 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13698 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13699 {
13700 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13701 	           0, retval);
13702 }
13703 
13704 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13705 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13706 {
13707 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13708 	           uap->options, retval);
13709 }
13710 
13711 /*
13712  * Common routine to handle various flavors of statfs data heading out
13713  *	to user space.
13714  *
13715  * Returns:	0			Success
13716  *		EFAULT
13717  */
13718 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)13719 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
13720     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
13721     boolean_t partial_copy)
13722 {
13723 	int             error;
13724 	int             my_size, copy_size;
13725 
13726 	if (is_64_bit) {
13727 		struct user64_statfs sfs;
13728 		my_size = copy_size = sizeof(sfs);
13729 		bzero(&sfs, my_size);
13730 		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13731 		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13732 		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13733 		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
13734 		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
13735 		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
13736 		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
13737 		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
13738 		sfs.f_files = (user64_long_t)sfsp->f_files;
13739 		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
13740 		sfs.f_fsid = sfsp->f_fsid;
13741 		sfs.f_owner = sfsp->f_owner;
13742 		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
13743 		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13744 		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13745 
13746 		if (partial_copy) {
13747 			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13748 		}
13749 		error = copyout((caddr_t)&sfs, bufp, copy_size);
13750 	} else {
13751 		struct user32_statfs sfs;
13752 
13753 		my_size = copy_size = sizeof(sfs);
13754 		bzero(&sfs, my_size);
13755 
13756 		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13757 		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13758 		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13759 
13760 		/*
13761 		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
13762 		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
13763 		 * to reflect the filesystem size as best we can.
13764 		 */
13765 		if ((sfsp->f_blocks > INT_MAX)
13766 		    /* Hack for 4061702 . I think the real fix is for Carbon to
13767 		     * look for some volume capability and not depend on hidden
13768 		     * semantics agreed between a FS and carbon.
13769 		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
13770 		     * for Carbon to set bNoVolumeSizes volume attribute.
13771 		     * Without this the webdavfs files cannot be copied onto
13772 		     * disk as they look huge. This change should not affect
13773 		     * XSAN as they should not setting these to -1..
13774 		     */
13775 		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
13776 		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
13777 		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
13778 			int             shift;
13779 
13780 			/*
13781 			 * Work out how far we have to shift the block count down to make it fit.
13782 			 * Note that it's possible to have to shift so far that the resulting
13783 			 * blocksize would be unreportably large.  At that point, we will clip
13784 			 * any values that don't fit.
13785 			 *
13786 			 * For safety's sake, we also ensure that f_iosize is never reported as
13787 			 * being smaller than f_bsize.
13788 			 */
13789 			for (shift = 0; shift < 32; shift++) {
13790 				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
13791 					break;
13792 				}
13793 				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
13794 					break;
13795 				}
13796 			}
13797 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
13798 			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
13799 			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
13800 			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
13801 #undef __SHIFT_OR_CLIP
13802 			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
13803 			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
13804 		} else {
13805 			/* filesystem is small enough to be reported honestly */
13806 			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
13807 			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
13808 			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
13809 			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
13810 			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
13811 		}
13812 		sfs.f_files = (user32_long_t)sfsp->f_files;
13813 		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
13814 		sfs.f_fsid = sfsp->f_fsid;
13815 		sfs.f_owner = sfsp->f_owner;
13816 		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
13817 		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13818 		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13819 
13820 		if (partial_copy) {
13821 			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13822 		}
13823 		error = copyout((caddr_t)&sfs, bufp, copy_size);
13824 	}
13825 
13826 	if (sizep != NULL) {
13827 		*sizep = my_size;
13828 	}
13829 	return error;
13830 }
13831 
13832 /*
13833  * copy stat structure into user_stat structure.
13834  */
13835 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13836 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13837 {
13838 	bzero(usbp, sizeof(*usbp));
13839 
13840 	usbp->st_dev = sbp->st_dev;
13841 	usbp->st_ino = sbp->st_ino;
13842 	usbp->st_mode = sbp->st_mode;
13843 	usbp->st_nlink = sbp->st_nlink;
13844 	usbp->st_uid = sbp->st_uid;
13845 	usbp->st_gid = sbp->st_gid;
13846 	usbp->st_rdev = sbp->st_rdev;
13847 #ifndef _POSIX_C_SOURCE
13848 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13849 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13850 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13851 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13852 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13853 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13854 #else
13855 	usbp->st_atime = sbp->st_atime;
13856 	usbp->st_atimensec = sbp->st_atimensec;
13857 	usbp->st_mtime = sbp->st_mtime;
13858 	usbp->st_mtimensec = sbp->st_mtimensec;
13859 	usbp->st_ctime = sbp->st_ctime;
13860 	usbp->st_ctimensec = sbp->st_ctimensec;
13861 #endif
13862 	usbp->st_size = sbp->st_size;
13863 	usbp->st_blocks = sbp->st_blocks;
13864 	usbp->st_blksize = sbp->st_blksize;
13865 	usbp->st_flags = sbp->st_flags;
13866 	usbp->st_gen = sbp->st_gen;
13867 	usbp->st_lspare = sbp->st_lspare;
13868 	usbp->st_qspare[0] = sbp->st_qspare[0];
13869 	usbp->st_qspare[1] = sbp->st_qspare[1];
13870 }
13871 
/*
 * Copy the kernel `struct stat` into the 32-bit user ABI layout.
 * Timestamp seconds/nanoseconds are explicitly narrowed to the 32-bit
 * user types; the destination is zeroed first.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE; note the 32-bit truncating casts. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13908 
13909 /*
13910  * copy stat64 structure into user_stat64 structure.
13911  */
13912 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)13913 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
13914 {
13915 	bzero(usbp, sizeof(*usbp));
13916 
13917 	usbp->st_dev = sbp->st_dev;
13918 	usbp->st_ino = sbp->st_ino;
13919 	usbp->st_mode = sbp->st_mode;
13920 	usbp->st_nlink = sbp->st_nlink;
13921 	usbp->st_uid = sbp->st_uid;
13922 	usbp->st_gid = sbp->st_gid;
13923 	usbp->st_rdev = sbp->st_rdev;
13924 #ifndef _POSIX_C_SOURCE
13925 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13926 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13927 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13928 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13929 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13930 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13931 	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
13932 	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
13933 #else
13934 	usbp->st_atime = sbp->st_atime;
13935 	usbp->st_atimensec = sbp->st_atimensec;
13936 	usbp->st_mtime = sbp->st_mtime;
13937 	usbp->st_mtimensec = sbp->st_mtimensec;
13938 	usbp->st_ctime = sbp->st_ctime;
13939 	usbp->st_ctimensec = sbp->st_ctimensec;
13940 	usbp->st_birthtime = sbp->st_birthtime;
13941 	usbp->st_birthtimensec = sbp->st_birthtimensec;
13942 #endif
13943 	usbp->st_size = sbp->st_size;
13944 	usbp->st_blocks = sbp->st_blocks;
13945 	usbp->st_blksize = sbp->st_blksize;
13946 	usbp->st_flags = sbp->st_flags;
13947 	usbp->st_gen = sbp->st_gen;
13948 	usbp->st_lspare = sbp->st_lspare;
13949 	usbp->st_qspare[0] = sbp->st_qspare[0];
13950 	usbp->st_qspare[1] = sbp->st_qspare[1];
13951 }
13952 
/*
 * Copy the kernel `struct stat64` into the 32-bit user ABI stat64 layout.
 * Timestamp seconds/nanoseconds (including birth time) are explicitly
 * narrowed to the 32-bit user types; the destination is zeroed first.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE; note the 32-bit truncating casts. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13993 
13994 /*
13995  * Purge buffer cache for simulating cold starts
13996  */
13997 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13998 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13999 {
14000 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14001 
14002 	return VNODE_RETURNED;
14003 }
14004 
14005 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14006 vfs_purge_callback(mount_t mp, __unused void * arg)
14007 {
14008 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14009 
14010 	return VFS_RETURNED;
14011 }
14012 
/* Boot-arg / sysctl vfs.purge_vm_pagers: when true, vfs_purge() also purges file-backed VM pagers. */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14015 
14016 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14017 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14018 {
14019 	if (!kauth_cred_issuser(kauth_cred_get())) {
14020 		return EPERM;
14021 	}
14022 
14023 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14024 
14025 	/* also flush any VM pagers backed by files */
14026 	if (vfs_purge_vm_pagers) {
14027 		vm_purge_filebacked_pagers();
14028 	}
14029 
14030 	return 0;
14031 }
14032 
14033 /*
14034  * gets the vnode associated with the (unnamed) snapshot directory
14035  * for a Filesystem. The snapshot directory vnode is returned with
14036  * an iocount on it.
14037  */
14038 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14039 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14040 {
14041 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14042 }
14043 
14044 /*
14045  * Get the snapshot vnode.
14046  *
14047  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14048  * needs nameidone() on ndp.
14049  *
14050  * If the snapshot vnode exists it is returned in ndp->ni_vp.
14051  *
14052  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14053  * not needed.
14054  */
14055 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)14056 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
14057     user_addr_t name, struct nameidata *ndp, int32_t op,
14058 #if !CONFIG_TRIGGERS
14059     __unused
14060 #endif
14061     enum path_operation pathop,
14062     vfs_context_t ctx)
14063 {
14064 	int error, i;
14065 	caddr_t name_buf;
14066 	size_t name_len;
14067 	struct vfs_attr vfa;
14068 
14069 	*sdvpp = NULLVP;
14070 	*rvpp = NULLVP;
14071 
14072 	error = vnode_getfromfd(ctx, dirfd, rvpp);
14073 	if (error) {
14074 		return error;
14075 	}
14076 
14077 	if (!vnode_isvroot(*rvpp)) {
14078 		error = EINVAL;
14079 		goto out;
14080 	}
14081 
14082 	/* Make sure the filesystem supports snapshots */
14083 	VFSATTR_INIT(&vfa);
14084 	VFSATTR_WANTED(&vfa, f_capabilities);
14085 	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
14086 	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
14087 	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
14088 	    VOL_CAP_INT_SNAPSHOT)) ||
14089 	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
14090 	    VOL_CAP_INT_SNAPSHOT))) {
14091 		error = ENOTSUP;
14092 		goto out;
14093 	}
14094 
14095 	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
14096 	if (error) {
14097 		goto out;
14098 	}
14099 
14100 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14101 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14102 	if (error) {
14103 		goto out1;
14104 	}
14105 
14106 	/*
14107 	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
14108 	 * (the length returned by copyinstr includes the terminating NUL)
14109 	 */
14110 	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
14111 	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
14112 		error = EINVAL;
14113 		goto out1;
14114 	}
14115 	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
14116 		;
14117 	}
14118 	if (i < (int)name_len) {
14119 		error = EINVAL;
14120 		goto out1;
14121 	}
14122 
14123 #if CONFIG_MACF
14124 	if (op == CREATE) {
14125 		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
14126 		    name_buf);
14127 	} else if (op == DELETE) {
14128 		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
14129 		    name_buf);
14130 	}
14131 	if (error) {
14132 		goto out1;
14133 	}
14134 #endif
14135 
14136 	/* Check if the snapshot already exists ... */
14137 	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
14138 	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
14139 	ndp->ni_dvp = *sdvpp;
14140 
14141 	error = namei(ndp);
14142 out1:
14143 	zfree(ZV_NAMEI, name_buf);
14144 out:
14145 	if (error) {
14146 		if (*sdvpp) {
14147 			vnode_put(*sdvpp);
14148 			*sdvpp = NULLVP;
14149 		}
14150 		if (*rvpp) {
14151 			vnode_put(*rvpp);
14152 			*rvpp = NULLVP;
14153 		}
14154 	}
14155 	return error;
14156 }
14157 
14158 /*
14159  * create a filesystem snapshot (for supporting filesystems)
14160  *
14161  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14162  * We get to the (unnamed) snapshot directory vnode and create the vnode
14163  * for the snapshot in it.
14164  *
14165  * Restrictions:
14166  *
14167  *    a) Passed in name for snapshot cannot have slashes.
14168  *    b) name can't be "." or ".."
14169  *
14170  * Since this requires superuser privileges, vnode_authorize calls are not
14171  * made.
14172  */
14173 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14174 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
14175     vfs_context_t ctx)
14176 {
14177 	vnode_t rvp, snapdvp;
14178 	int error;
14179 	struct nameidata *ndp;
14180 
14181 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
14182 
14183 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
14184 	    OP_LINK, ctx);
14185 	if (error) {
14186 		goto out;
14187 	}
14188 
14189 	if (ndp->ni_vp) {
14190 		vnode_put(ndp->ni_vp);
14191 		error = EEXIST;
14192 	} else {
14193 		struct vnode_attr *vap;
14194 		vnode_t vp = NULLVP;
14195 
14196 		vap = kalloc_type(struct vnode_attr, Z_WAITOK);
14197 
14198 		VATTR_INIT(vap);
14199 		VATTR_SET(vap, va_type, VREG);
14200 		VATTR_SET(vap, va_mode, 0);
14201 
14202 		error = vn_create(snapdvp, &vp, ndp, vap,
14203 		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
14204 		if (!error && vp) {
14205 			vnode_put(vp);
14206 		}
14207 
14208 		kfree_type(struct vnode_attr, vap);
14209 	}
14210 
14211 	nameidone(ndp);
14212 	vnode_put(snapdvp);
14213 	vnode_put(rvp);
14214 out:
14215 	kfree_type(struct nameidata, ndp);
14216 
14217 	return error;
14218 }
14219 
14220 /*
14221  * Delete a Filesystem snapshot
14222  *
14223  * get the vnode for the unnamed snapshot directory and the snapshot and
14224  * delete the snapshot.
14225  */
14226 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14227 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
14228     vfs_context_t ctx)
14229 {
14230 	vnode_t rvp, snapdvp;
14231 	int error;
14232 	struct nameidata *ndp;
14233 
14234 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
14235 
14236 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
14237 	    OP_UNLINK, ctx);
14238 	if (error) {
14239 		goto out;
14240 	}
14241 
14242 	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
14243 	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
14244 
14245 	vnode_put(ndp->ni_vp);
14246 	nameidone(ndp);
14247 	vnode_put(snapdvp);
14248 	vnode_put(rvp);
14249 out:
14250 	kfree_type(struct nameidata, ndp);
14251 
14252 	return error;
14253 }
14254 
14255 /*
14256  * Revert a filesystem to a snapshot
14257  *
14258  * Marks the filesystem to revert to the given snapshot on next mount.
14259  */
14260 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14261 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14262     vfs_context_t ctx)
14263 {
14264 	int error;
14265 	vnode_t rvp;
14266 	mount_t mp;
14267 	struct fs_snapshot_revert_args revert_data;
14268 	struct componentname cnp;
14269 	caddr_t name_buf;
14270 	size_t name_len;
14271 
14272 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14273 	if (error) {
14274 		return error;
14275 	}
14276 	mp = vnode_mount(rvp);
14277 
14278 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14279 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14280 	if (error) {
14281 		zfree(ZV_NAMEI, name_buf);
14282 		vnode_put(rvp);
14283 		return error;
14284 	}
14285 
14286 #if CONFIG_MACF
14287 	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14288 	if (error) {
14289 		zfree(ZV_NAMEI, name_buf);
14290 		vnode_put(rvp);
14291 		return error;
14292 	}
14293 #endif
14294 
14295 	/*
14296 	 * Grab mount_iterref so that we can release the vnode,
14297 	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14298 	 */
14299 	error = mount_iterref(mp, 0);
14300 	vnode_put(rvp);
14301 	if (error) {
14302 		zfree(ZV_NAMEI, name_buf);
14303 		return error;
14304 	}
14305 
14306 	memset(&cnp, 0, sizeof(cnp));
14307 	cnp.cn_pnbuf = (char *)name_buf;
14308 	cnp.cn_nameiop = LOOKUP;
14309 	cnp.cn_flags = ISLASTCN | HASBUF;
14310 	cnp.cn_pnlen = MAXPATHLEN;
14311 	cnp.cn_nameptr = cnp.cn_pnbuf;
14312 	cnp.cn_namelen = (int)name_len;
14313 	revert_data.sr_cnp = &cnp;
14314 
14315 	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14316 	mount_iterdrop(mp);
14317 	zfree(ZV_NAMEI, name_buf);
14318 
14319 	if (error) {
14320 		/* If there was any error, try again using VNOP_IOCTL */
14321 
14322 		vnode_t snapdvp;
14323 		struct nameidata namend;
14324 
14325 		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14326 		    OP_LOOKUP, ctx);
14327 		if (error) {
14328 			return error;
14329 		}
14330 
14331 
14332 		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14333 		    0, ctx);
14334 
14335 		vnode_put(namend.ni_vp);
14336 		nameidone(&namend);
14337 		vnode_put(snapdvp);
14338 		vnode_put(rvp);
14339 	}
14340 
14341 	return error;
14342 }
14343 
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 *
 * dirfd: fd referencing a vnode on the target filesystem
 * old/new: user pointers to the current and desired snapshot names
 *
 * Returns 0 on success or an errno value.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot with DELETE intent; on success we hold
	 * iocounts on rvp, snapdvp and the source snapshot vnode.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; stopping before name_len means a slash was found */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming effectively creates a snapshot under the new name */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Resolve the destination name relative to the snapshot directory */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both source and destination live in the same snapshot directory */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14446 
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 *
 * dirfd: fd referencing a vnode on the source filesystem
 * name: user pointer to the snapshot name
 * directory: user pointer to the path of the mount point (covered vnode)
 *
 * Returns 0 on success or an errno value.
 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Resolve the snapshot; on success we hold iocounts on the fs root,
	 * the snapshot directory and the snapshot vnode itself.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* Bail out if the source filesystem has gone away underneath us */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root of the root filesystem */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Mount with the source filesystem's own fs type; only a whitelisted
	 * subset of user flags is honored for a snapshot mount.
	 */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14528 
14529 /*
14530  * Root from a snapshot of the filesystem
14531  *
14532  * Marks the filesystem to root from the given snapshot on next boot.
14533  */
14534 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14535 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14536     vfs_context_t ctx)
14537 {
14538 	int error;
14539 	vnode_t rvp;
14540 	mount_t mp;
14541 	struct fs_snapshot_root_args root_data;
14542 	struct componentname cnp;
14543 	caddr_t name_buf;
14544 	size_t name_len;
14545 
14546 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14547 	if (error) {
14548 		return error;
14549 	}
14550 	mp = vnode_mount(rvp);
14551 
14552 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14553 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14554 	if (error) {
14555 		zfree(ZV_NAMEI, name_buf);
14556 		vnode_put(rvp);
14557 		return error;
14558 	}
14559 
14560 	// XXX MAC checks ?
14561 
14562 	/*
14563 	 * Grab mount_iterref so that we can release the vnode,
14564 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14565 	 */
14566 	error = mount_iterref(mp, 0);
14567 	vnode_put(rvp);
14568 	if (error) {
14569 		zfree(ZV_NAMEI, name_buf);
14570 		return error;
14571 	}
14572 
14573 	memset(&cnp, 0, sizeof(cnp));
14574 	cnp.cn_pnbuf = (char *)name_buf;
14575 	cnp.cn_nameiop = LOOKUP;
14576 	cnp.cn_flags = ISLASTCN | HASBUF;
14577 	cnp.cn_pnlen = MAXPATHLEN;
14578 	cnp.cn_nameptr = cnp.cn_pnbuf;
14579 	cnp.cn_namelen = (int)name_len;
14580 	root_data.sr_cnp = &cnp;
14581 
14582 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14583 
14584 	mount_iterdrop(mp);
14585 	zfree(ZV_NAMEI, name_buf);
14586 
14587 	return error;
14588 }
14589 
14590 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14591 vfs_context_can_snapshot(vfs_context_t ctx)
14592 {
14593 	static const char * const snapshot_entitlements[] = {
14594 		"com.apple.private.vfs.snapshot",
14595 		"com.apple.developer.vfs.snapshot",
14596 		"com.apple.private.apfs.arv.limited.snapshot",
14597 	};
14598 	static const size_t nentitlements =
14599 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14600 	size_t i;
14601 
14602 	task_t task = vfs_context_task(ctx);
14603 	for (i = 0; i < nentitlements; i++) {
14604 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14605 			return TRUE;
14606 		}
14607 	}
14608 	return FALSE;
14609 }
14610 
/*
 * FS snapshot operations dispatcher
 *
 * Entry point for the fs_snapshot() syscall. Verifies the caller's
 * entitlement and (for modifying ops) authorization, then dispatches to
 * the per-operation handler. Returns 0 or an errno value.
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Caller's task must hold one of the snapshot entitlements */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed when any of: caller is superuser, caller may write
		 * the backing device, or caller holds the user-snapshot
		 * entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14702