xref: /xnu-10002.61.3/bsd/vfs/vfs_syscalls.c (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117 
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122 
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125 
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130 
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137 
138 #include <nfs/nfs_conf.h>
139 
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143 
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148 
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 	((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 	release_pathbuff(x)
154 #else
155 #define GET_PATH(x)     \
156 	((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 	zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160 
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164 
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168 
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
171 #endif
172 
173 extern void disk_conditioner_unmount(mount_t mp);
174 
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 	vnode_t olddp;
178 	vnode_t newdp;
179 };
180 /* callback  for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182 
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192     boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195     struct componentname *cnp, user_addr_t fsmountargs,
196     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198 
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200 
201 struct fd_vn_data * fg_vn_data_alloc(void);
202 
203 /*
204  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205  * Concurrent lookups (or lookups by ids) on hard links can cause the
206  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207  * does) to return ENOENT as the path cannot be returned from the name cache
208  * alone. We have no option but to retry and hope to get one namei->reverse path
209  * generation done without an intervening lookup, lookup by id on the hard link
210  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211  * which currently are the MAC hooks for rename, unlink and rmdir.
212  */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214 
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217 
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219     int unlink_flags);
220 
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229 
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236 
237 __private_extern__
238 int sync_internal(void);
239 
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242 
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245 
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249 
250 extern lck_rw_t rootvnode_rw_lock;
251 
252 VFS_SMR_DECLARE;
253 extern uint32_t nc_smr_enabled;
254 
255 /*
256  * incremented each time a mount or unmount operation occurs
257  * used to invalidate the cached value of the rootvp in the
258  * mount structure utilized by cache_lookup_path
259  */
260 uint32_t mount_generation = 0;
261 
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264 
265 /* system-wide, per-boot unique mount ID */
266 static _Atomic uint64_t mount_unique_id = 1;
267 
268 extern const struct fileops vnops;
269 #if CONFIG_APPLEDOUBLE
270 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
271 #endif /* CONFIG_APPLEDOUBLE */
272 
273 /* Maximum buffer length supported by fsgetpath(2) */
274 #define FSGETPATH_MAXBUFLEN  8192
275 
276 /*
277  * Virtual File System System Calls
278  */
279 
280 /*
281  * Private in-kernel mounting spi (specific use-cases only)
282  */
283 boolean_t
vfs_iskernelmount(mount_t mp)284 vfs_iskernelmount(mount_t mp)
285 {
286 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
287 }
288 
/*
 * In-kernel mount entry point (private SPI).
 *
 * If 'vp' is supplied by the caller it is used directly as the vnode to be
 * covered (with 'pvp' as its parent) and 'path' is only used to fake up a
 * componentname for mount_common(); otherwise the mount-on vnode and its
 * parent are looked up here from 'path'.
 *
 * Iocounts on 'vp'/'pvp' are dropped here only when this function obtained
 * them via namei(); caller-supplied vnodes are left untouched.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Restrict caller-provided kernel-mount flags to the permitted set. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log only for snapshot / volume-by-role kernel mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		char *pnbuf = CAST_DOWN(char *, path);

		/* Fake up a componentname so mount_common() has a path to use. */
		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as a kernel-initiated mount. */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	if (did_namei) {
		/* Drop the iocounts and namei state acquired above. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
338 
339 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)340 vfs_mount_at_path(const char *fstype, const char *path,
341     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
342     int mnt_flags, int flags)
343 {
344 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
345 	int error, km_flags = 0;
346 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
347 
348 	/*
349 	 * This call is currently restricted to specific use cases.
350 	 */
351 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
352 		return ENOTSUP;
353 	}
354 
355 #if !defined(XNU_TARGET_OS_OSX)
356 	if (strcmp(fstype, "lifs") == 0) {
357 		syscall_flags |= MNT_NOEXEC;
358 	}
359 #endif
360 
361 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
362 		km_flags |= KERNEL_MOUNT_NOAUTH;
363 	}
364 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
365 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
366 	}
367 
368 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
369 	    syscall_flags, km_flags, ctx);
370 	if (error) {
371 		printf("%s: mount on %s failed, error %d\n", __func__, path,
372 		    error);
373 	}
374 
375 	return error;
376 }
377 
378 /*
379  * Mount a file system.
380  */
381 /* ARGSUSED */
382 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)383 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
384 {
385 	struct __mac_mount_args muap;
386 
387 	muap.type = uap->type;
388 	muap.path = uap->path;
389 	muap.flags = uap->flags;
390 	muap.data = uap->data;
391 	muap.mac_p = USER_ADDR_NULL;
392 	return __mac_mount(p, &muap, retval);
393 }
394 
395 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)396 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
397 {
398 	struct componentname    cn;
399 	vfs_context_t           ctx = vfs_context_current();
400 	size_t                  dummy = 0;
401 	int                     error;
402 	int                     flags = uap->flags;
403 	char                    fstypename[MFSNAMELEN];
404 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
405 	vnode_t                 pvp;
406 	vnode_t                 vp;
407 
408 	AUDIT_ARG(fd, uap->fd);
409 	AUDIT_ARG(fflags, flags);
410 	/* fstypename will get audited by mount_common */
411 
412 	/* Sanity check the flags */
413 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
414 		return ENOTSUP;
415 	}
416 
417 	if (flags & MNT_UNION) {
418 		return EPERM;
419 	}
420 
421 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
422 	if (error) {
423 		return error;
424 	}
425 
426 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
427 		return error;
428 	}
429 
430 	if ((error = vnode_getwithref(vp)) != 0) {
431 		file_drop(uap->fd);
432 		return error;
433 	}
434 
435 	pvp = vnode_getparent(vp);
436 	if (pvp == NULL) {
437 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
438 			error = EBUSY;
439 		} else {
440 			error = EINVAL;
441 		}
442 		vnode_put(vp);
443 		file_drop(uap->fd);
444 		return error;
445 	}
446 
447 	memset(&cn, 0, sizeof(struct componentname));
448 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
449 	cn.cn_pnlen = MAXPATHLEN;
450 
451 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
452 		zfree(ZV_NAMEI, cn.cn_pnbuf);
453 		vnode_put(pvp);
454 		vnode_put(vp);
455 		file_drop(uap->fd);
456 		return error;
457 	}
458 
459 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
460 
461 	zfree(ZV_NAMEI, cn.cn_pnbuf);
462 	vnode_put(pvp);
463 	vnode_put(vp);
464 	file_drop(uap->fd);
465 
466 	return error;
467 }
468 
469 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
470 
471 /*
472  * Get the size of a graft file (a manifest or payload file).
473  * The vp should be an iocounted vnode.
474  */
475 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)476 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
477 {
478 	struct stat64 sb = {};
479 	int error;
480 
481 	*size = 0;
482 
483 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
484 	if (error) {
485 		return error;
486 	}
487 
488 	if (sb.st_size == 0) {
489 		error = ENODATA;
490 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
491 		error = EFBIG;
492 	} else {
493 		*size = (size_t) sb.st_size;
494 	}
495 
496 	return error;
497 }
498 
499 /*
500  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
501  * `size` must already be validated.
502  */
503 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)504 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
505 {
506 	return vn_rdwr(UIO_READ, graft_vp,
507 	           (caddr_t) buf, (int) size, /* offset */ 0,
508 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
509 	           vfs_context_ucred(vctx), /* resid */ NULL,
510 	           vfs_context_proc(vctx));
511 }
512 
513 /*
514  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
515  * and read it into `buf`.
516  */
517 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)518 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
519 {
520 	vnode_t metadata_vp = NULLVP;
521 	int error;
522 
523 	// Convert this graft fd to a vnode.
524 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
525 		goto out;
526 	}
527 
528 	// Get (and validate) size information.
529 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
530 		goto out;
531 	}
532 
533 	// Read each file into the provided buffer - we must get the expected amount of bytes.
534 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
535 		goto out;
536 	}
537 
538 out:
539 	if (metadata_vp) {
540 		vnode_put(metadata_vp);
541 		metadata_vp = NULLVP;
542 	}
543 
544 	return error;
545 }
546 
547 /*
548  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
549  * provided in `gfs`, saving the size of data read in `gfs`.
550  */
551 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)552 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
553     fsioc_graft_fs_t *gfs)
554 {
555 	int error;
556 
557 	// Read the authentic manifest.
558 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
559 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
560 		return error;
561 	}
562 
563 	// The user manifest is currently unused, but set its size.
564 	gfs->user_manifest_size = 0;
565 
566 	// Read the payload.
567 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
568 	    &gfs->payload_size, gfs->payload))) {
569 		return error;
570 	}
571 
572 	return 0;
573 }
574 
/*
 * Call into the filesystem to verify and graft a cryptex.
 *
 * graft_type  - GRAFTDMG_CRYPTEX_* type (range-checked by the caller)
 * sbc_args    - secure-boot cryptex arguments (already copied in from user)
 * vctx        - caller's vfs context
 * cryptex_vp  - iocounted vnode for the cryptex disk image
 * mounton_vp  - optional iocounted vnode to graft upon (may be NULLVP)
 *
 * Returns 0 on success, or an errno from pre-flight validation, metadata
 * reading, or the filesystem's FSIOC_GRAFT_FS handler.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller-visible SBC_* flags into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Both buffers are freed on every path (kalloc_data may have failed above).
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
666 
667 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
668 
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * Requires the GRAFTDMG_ENTITLEMENT ("com.apple.private.vfs.graftdmg")
 * entitlement; the mount directory argument is optional.
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* This SPI is restricted to entitled callers. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the graft-argument union before doing any lookups. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			// NOTE(review): early return assumes namei() releases its
			// own state on failure (no nameidone() here) — consistent
			// with the other namei() error paths in this file.
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Range-check the graft type, then hand off to the secure-boot graft path. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	// Release namei state only if the lookup was actually performed.
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
740 
741 /*
742  * Ungraft a cryptex disk image (via mount dir FD)
743  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
744  */
745 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)746 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
747 {
748 	int error = 0;
749 	user_addr_t ua_mountdir = uap->mountdir;
750 	fsioc_ungraft_fs_t ugfs;
751 	vnode_t mounton_vp = NULLVP;
752 	struct nameidata nd = {};
753 	vfs_context_t ctx = vfs_context_current();
754 
755 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
756 		return EPERM;
757 	}
758 
759 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
760 		return EINVAL;
761 	}
762 
763 	ugfs.ungraft_flags = 0;
764 
765 	// Acquire vnode for mount-on path
766 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
767 	    UIO_USERSPACE, ua_mountdir, ctx);
768 
769 	error = namei(&nd);
770 	if (error) {
771 		return error;
772 	}
773 	mounton_vp = nd.ni_vp;
774 
775 	// Call into the FS to perform the ungraft
776 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
777 
778 	vnode_put(mounton_vp);
779 	nameidone(&nd);
780 
781 	return error;
782 }
783 
784 
/*
 * Notify interested parties that a mount has appeared: broadcast a
 * VQ_MOUNT vfs event, then post NOTE_WRITE on the covered vnode's
 * parent directory so knote watchers see the directory change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
791 
792 /*
793  * __mac_mount:
794  *	Mount a file system taking into account MAC label behavior.
795  *	See mount(2) man page for more information
796  *
797  * Parameters:    p                        Process requesting the mount
798  *                uap                      User argument descriptor (see below)
799  *                retval                   (ignored)
800  *
801  * Indirect:      uap->type                Filesystem type
802  *                uap->path                Path to mount
803  *                uap->data                Mount arguments
804  *                uap->mac_p               MAC info
805  *                uap->flags               Mount flags
806  *
807  *
808  * Returns:        0                       Success
809  *                !0                       Not success
810  */
811 boolean_t root_fs_upgrade_try = FALSE;
812 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;     /* copied-in MAC label string, if any */
	size_t labelsz = 0;        /* allocation size of labelstr, for kfree_data() */
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Refuse to follow any symlink component in the mount path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The user's mac struct layout differs for 32- and 64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label buffer: at least one char + NUL, at most the MAC cap. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are not supported in this configuration. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling for mounting on the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* Safe when labelstr is NULL (nothing was allocated). */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
973 
974 /*
975  * common mount implementation (final stage of mounting)
976  *
977  * Arguments:
978  *  fstypename	file system type (ie it's vfs name)
979  *  pvp		parent of covered vnode
980  *  vp		covered vnode
981  *  cnp		component name (ie path) of covered vnode
982  *  flags	generic mount flags
983  *  fsmountargs	file system specific data
984  *  labelstr	optional MAC label
985  *  kernelmount	TRUE for mounts initiated from inside the kernel
986  *  ctx		caller's context
987  */
988 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)989 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
990     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
991     char *labelstr, vfs_context_t ctx)
992 {
993 #if !CONFIG_MACF
994 #pragma unused(labelstr)
995 #endif
996 	struct vnode *devvp = NULLVP;
997 	struct vnode *device_vnode = NULLVP;
998 #if CONFIG_MACF
999 	struct vnode *rvp;
1000 #endif
1001 	struct mount *mp = NULL;
1002 	struct vfstable *vfsp = (struct vfstable *)0;
1003 	struct proc *p = vfs_context_proc(ctx);
1004 	int error, flag = 0;
1005 	bool flag_set = false;
1006 	user_addr_t devpath = USER_ADDR_NULL;
1007 	int ronly = 0;
1008 	int mntalloc = 0;
1009 	boolean_t vfsp_ref = FALSE;
1010 	boolean_t is_rwlock_locked = FALSE;
1011 	boolean_t did_rele = FALSE;
1012 	boolean_t have_usecount = FALSE;
1013 	boolean_t did_set_lmount = FALSE;
1014 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1015 
1016 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1017 	/* Check for mutually-exclusive flag bits */
1018 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1019 	int bitcount = 0;
1020 	while (checkflags != 0) {
1021 		checkflags &= (checkflags - 1);
1022 		bitcount++;
1023 	}
1024 
1025 	if (bitcount > 1) {
1026 		//not allowed to request multiple mount-by-role flags
1027 		error = EINVAL;
1028 		goto out1;
1029 	}
1030 #endif
1031 
1032 	/*
1033 	 * Process an update for an existing mount
1034 	 */
1035 	if (flags & MNT_UPDATE) {
1036 		if ((vp->v_flag & VROOT) == 0) {
1037 			error = EINVAL;
1038 			goto out1;
1039 		}
1040 		mp = vp->v_mount;
1041 
1042 		/* if unmount or mount in progress, return error */
1043 		mount_lock_spin(mp);
1044 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1045 			mount_unlock(mp);
1046 			error = EBUSY;
1047 			goto out1;
1048 		}
1049 		mp->mnt_lflag |= MNT_LMOUNT;
1050 		did_set_lmount = TRUE;
1051 		mount_unlock(mp);
1052 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
1053 		is_rwlock_locked = TRUE;
1054 		/*
1055 		 * We only allow the filesystem to be reloaded if it
1056 		 * is currently mounted read-only.
1057 		 */
1058 		if ((flags & MNT_RELOAD) &&
1059 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1060 			error = ENOTSUP;
1061 			goto out1;
1062 		}
1063 
1064 		/*
1065 		 * If content protection is enabled, update mounts are not
1066 		 * allowed to turn it off.
1067 		 */
1068 		if ((mp->mnt_flag & MNT_CPROTECT) &&
1069 		    ((flags & MNT_CPROTECT) == 0)) {
1070 			error = EINVAL;
1071 			goto out1;
1072 		}
1073 
1074 		/*
1075 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1076 		 * failure to return an error for this so we'll just silently
1077 		 * add it if it is not passed in.
1078 		 */
1079 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
1080 		    ((flags & MNT_REMOVABLE) == 0)) {
1081 			flags |= MNT_REMOVABLE;
1082 		}
1083 
1084 		/* Can't downgrade the backer of the root FS */
1085 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1086 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1087 			error = ENOTSUP;
1088 			goto out1;
1089 		}
1090 
1091 		/*
1092 		 * Only root, or the user that did the original mount is
1093 		 * permitted to update it.
1094 		 */
1095 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1096 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1097 			goto out1;
1098 		}
1099 #if CONFIG_MACF
1100 		error = mac_mount_check_remount(ctx, mp);
1101 		if (error != 0) {
1102 			goto out1;
1103 		}
1104 #endif
1105 		/*
1106 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1107 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1108 		 */
1109 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1110 			flags |= MNT_NOSUID | MNT_NODEV;
1111 			if (mp->mnt_flag & MNT_NOEXEC) {
1112 				flags |= MNT_NOEXEC;
1113 			}
1114 		}
1115 		flag = mp->mnt_flag;
1116 		flag_set = true;
1117 
1118 
1119 
1120 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1121 
1122 		vfsp = mp->mnt_vtable;
1123 		goto update;
1124 	} // MNT_UPDATE
1125 
1126 	/*
1127 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1128 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1129 	 */
1130 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1131 		flags |= MNT_NOSUID | MNT_NODEV;
1132 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1133 			flags |= MNT_NOEXEC;
1134 		}
1135 	}
1136 
1137 	/* XXXAUDIT: Should we capture the type on the error path as well? */
1138 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
1139 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1140 	mount_list_lock();
1141 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1142 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1143 			vfsp->vfc_refcount++;
1144 			vfsp_ref = TRUE;
1145 			break;
1146 		}
1147 	}
1148 	mount_list_unlock();
1149 	if (vfsp == NULL) {
1150 		error = ENODEV;
1151 		goto out1;
1152 	}
1153 
1154 	/*
1155 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1156 	 * except in ROSV configs and for the initial BaseSystem root.
1157 	 */
1158 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1159 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1160 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1161 		error = EINVAL;  /* unsupported request */
1162 		goto out1;
1163 	}
1164 
1165 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1166 	if (error != 0) {
1167 		goto out1;
1168 	}
1169 
1170 	/*
1171 	 * Allocate and initialize the filesystem (mount_t)
1172 	 */
1173 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1174 	mntalloc = 1;
1175 
1176 	/* Initialize the default IO constraints */
1177 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1178 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1179 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1180 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1181 	mp->mnt_devblocksize = DEV_BSIZE;
1182 	mp->mnt_alignmentmask = PAGE_MASK;
1183 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1184 	mp->mnt_ioscale = 1;
1185 	mp->mnt_ioflags = 0;
1186 	mp->mnt_realrootvp = NULLVP;
1187 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1188 
1189 	mp->mnt_lflag |= MNT_LMOUNT;
1190 	did_set_lmount = TRUE;
1191 
1192 	TAILQ_INIT(&mp->mnt_vnodelist);
1193 	TAILQ_INIT(&mp->mnt_workerqueue);
1194 	TAILQ_INIT(&mp->mnt_newvnodes);
1195 	mount_lock_init(mp);
1196 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1197 	is_rwlock_locked = TRUE;
1198 	mp->mnt_op = vfsp->vfc_vfsops;
1199 	mp->mnt_vtable = vfsp;
1200 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
1201 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1202 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1203 	do {
1204 		size_t pathlen = MAXPATHLEN;
1205 
1206 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1207 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1208 		}
1209 	} while (0);
1210 	mp->mnt_vnodecovered = vp;
1211 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1212 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1213 	mp->mnt_devbsdunit = 0;
1214 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1215 
1216 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1217 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1218 
1219 	if (kernelmount) {
1220 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1221 	}
1222 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1223 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1224 	}
1225 
1226 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
1227 		// kernel mounted devfs
1228 		mp->mnt_kern_flag |= MNTK_SYSTEM;
1229 	}
1230 
1231 update:
1232 
1233 	/*
1234 	 * Set the mount level flags.
1235 	 */
1236 	if (flags & MNT_RDONLY) {
1237 		mp->mnt_flag |= MNT_RDONLY;
1238 	} else if (mp->mnt_flag & MNT_RDONLY) {
1239 		// disallow read/write upgrades of file systems that
1240 		// had the TYPENAME_OVERRIDE feature set.
1241 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1242 			error = EPERM;
1243 			goto out1;
1244 		}
1245 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
1246 	}
1247 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1248 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1249 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1250 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1251 	    MNT_QUARANTINE | MNT_CPROTECT);
1252 
1253 #if SECURE_KERNEL
1254 #if !CONFIG_MNT_SUID
1255 	/*
1256 	 * On release builds of iOS based platforms, always enforce NOSUID on
1257 	 * all mounts. We do this here because we can catch update mounts as well as
1258 	 * non-update mounts in this case.
1259 	 */
1260 	mp->mnt_flag |= (MNT_NOSUID);
1261 #endif
1262 #endif
1263 
1264 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1265 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1266 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1267 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1268 	    MNT_QUARANTINE | MNT_CPROTECT);
1269 
1270 #if CONFIG_MACF
1271 	if (flags & MNT_MULTILABEL) {
1272 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1273 			error = EINVAL;
1274 			goto out1;
1275 		}
1276 		mp->mnt_flag |= MNT_MULTILABEL;
1277 	}
1278 #endif
1279 	/*
1280 	 * Process device path for local file systems if requested.
1281 	 *
1282 	 * Snapshot and mount-by-role mounts do not use this path; they are
1283 	 * passing other opaque data in the device path field.
1284 	 *
1285 	 * Basesystemroot mounts pass a device path to be resolved here,
1286 	 * but it's just a char * already inside the kernel, which
1287 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
1288 	 * mounts we must skip copyin (both of the address and of the string
1289 	 * (in NDINIT).
1290 	 */
1291 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1292 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1293 		boolean_t do_copyin_devpath = true;
1294 #if CONFIG_BASESYSTEMROOT
1295 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1296 			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1297 			// We have been passed fsmountargs, which is typed as a user_addr_t,
1298 			// but is actually a char ** pointing to a (kernelspace) string.
1299 			// We manually unpack it with a series of casts and dereferences
1300 			// that reverses what was done just above us on the stack in
1301 			// imageboot_pivot_image().
1302 			// After retrieving the path to the dev node (which we will NDINIT
1303 			// in a moment), we pass NULL fsmountargs on to the filesystem.
1304 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1305 			char **devnamepp = (char **)fsmountargs;
1306 			char *devnamep = *devnamepp;
1307 			devpath = CAST_USER_ADDR_T(devnamep);
1308 			do_copyin_devpath = false;
1309 			fsmountargs = USER_ADDR_NULL;
1310 
1311 			//Now that we have a mp, denote that this mount is for the basesystem.
1312 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1313 		}
1314 #endif // CONFIG_BASESYSTEMROOT
1315 
1316 		if (do_copyin_devpath) {
1317 			if (vfs_context_is64bit(ctx)) {
1318 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1319 					goto out1;
1320 				}
1321 				fsmountargs += sizeof(devpath);
1322 			} else {
1323 				user32_addr_t tmp;
1324 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1325 					goto out1;
1326 				}
1327 				/* munge into LP64 addr */
1328 				devpath = CAST_USER_ADDR_T(tmp);
1329 				fsmountargs += sizeof(tmp);
1330 			}
1331 		}
1332 
1333 		/* Lookup device and authorize access to it */
1334 		if ((devpath)) {
1335 			struct nameidata nd;
1336 
1337 			enum uio_seg seg = UIO_USERSPACE;
1338 #if CONFIG_BASESYSTEMROOT
1339 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1340 				seg = UIO_SYSSPACE;
1341 			}
1342 #endif // CONFIG_BASESYSTEMROOT
1343 
1344 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1345 			if ((error = namei(&nd))) {
1346 				goto out1;
1347 			}
1348 
1349 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1350 			devvp = nd.ni_vp;
1351 
1352 			nameidone(&nd);
1353 
1354 			if (devvp->v_type != VBLK) {
1355 				error = ENOTBLK;
1356 				goto out2;
1357 			}
1358 			if (major(devvp->v_rdev) >= nblkdev) {
1359 				error = ENXIO;
1360 				goto out2;
1361 			}
1362 			/*
1363 			 * If mount by non-root, then verify that user has necessary
1364 			 * permissions on the device.
1365 			 */
1366 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1367 				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1368 
1369 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1370 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1371 				}
1372 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1373 					goto out2;
1374 				}
1375 			}
1376 		}
1377 		/* On first mount, preflight and open device */
1378 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1379 			if ((error = vnode_ref(devvp))) {
1380 				goto out2;
1381 			}
1382 			/*
1383 			 * Disallow multiple mounts of the same device.
1384 			 * Disallow mounting of a device that is currently in use
1385 			 * (except for root, which might share swap device for miniroot).
1386 			 * Flush out any old buffers remaining from a previous use.
1387 			 */
1388 			if ((error = vfs_setmounting(devvp))) {
1389 				vnode_rele(devvp);
1390 				goto out2;
1391 			}
1392 
1393 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1394 				error = EBUSY;
1395 				goto out3;
1396 			}
1397 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1398 				error = ENOTBLK;
1399 				goto out3;
1400 			}
1401 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1402 				goto out3;
1403 			}
1404 
1405 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1406 #if CONFIG_MACF
1407 			error = mac_vnode_check_open(ctx,
1408 			    devvp,
1409 			    ronly ? FREAD : FREAD | FWRITE);
1410 			if (error) {
1411 				goto out3;
1412 			}
1413 #endif /* MAC */
1414 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1415 				goto out3;
1416 			}
1417 
1418 			mp->mnt_devvp = devvp;
1419 			device_vnode = devvp;
1420 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1421 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1422 		    (device_vnode = mp->mnt_devvp)) {
1423 			dev_t dev;
1424 			int maj;
1425 			/*
1426 			 * If upgrade to read-write by non-root, then verify
1427 			 * that user has necessary permissions on the device.
1428 			 */
1429 			vnode_getalways(device_vnode);
1430 
1431 			if (suser(vfs_context_ucred(ctx), NULL) &&
1432 			    (error = vnode_authorize(device_vnode, NULL,
1433 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1434 			    ctx)) != 0) {
1435 				vnode_put(device_vnode);
1436 				goto out2;
1437 			}
1438 
1439 			/* Tell the device that we're upgrading */
1440 			dev = (dev_t)device_vnode->v_rdev;
1441 			maj = major(dev);
1442 
1443 			if ((u_int)maj >= (u_int)nblkdev) {
1444 				panic("Volume mounted on a device with invalid major number.");
1445 			}
1446 
1447 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1448 			vnode_put(device_vnode);
1449 			device_vnode = NULLVP;
1450 			if (error != 0) {
1451 				goto out2;
1452 			}
1453 		}
1454 	} // localargs && !(snapshot | data | vm)
1455 
1456 #if CONFIG_MACF
1457 	if ((flags & MNT_UPDATE) == 0) {
1458 		mac_mount_label_init(mp);
1459 		mac_mount_label_associate(ctx, mp);
1460 	}
1461 	if (labelstr) {
1462 		if ((flags & MNT_UPDATE) != 0) {
1463 			error = mac_mount_check_label_update(ctx, mp);
1464 			if (error != 0) {
1465 				goto out3;
1466 			}
1467 		}
1468 	}
1469 #endif
1470 	/*
1471 	 * Mount the filesystem.  We already asserted that internal_flags
1472 	 * cannot have more than one mount-by-role bit set.
1473 	 */
1474 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1475 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1476 		    (caddr_t)fsmountargs, 0, ctx);
1477 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1478 #if CONFIG_ROSV_STARTUP
1479 		struct mount *origin_mp = (struct mount*)fsmountargs;
1480 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1481 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1482 		if (error) {
1483 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1484 		} else {
1485 			/* Mark volume associated with system volume */
1486 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1487 
1488 			/* Attempt to acquire the mnt_devvp and set it up */
1489 			struct vnode *mp_devvp = NULL;
1490 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1491 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1492 				    0, &mp_devvp, vfs_context_kernel());
1493 				if (!lerr) {
1494 					mp->mnt_devvp = mp_devvp;
1495 					//vnode_lookup took an iocount, need to drop it.
1496 					vnode_put(mp_devvp);
1497 					// now set `device_vnode` to the devvp that was acquired.
1498 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1499 					// note that though the iocount above was dropped, the mount acquires
1500 					// an implicit reference against the device.
1501 					device_vnode = mp_devvp;
1502 				}
1503 			}
1504 		}
1505 #else
1506 		error = EINVAL;
1507 #endif
1508 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1509 #if CONFIG_MOUNT_VM
1510 		struct mount *origin_mp = (struct mount*)fsmountargs;
1511 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1512 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1513 		if (error) {
1514 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1515 		} else {
1516 			/* Mark volume associated with system volume and a swap mount */
1517 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1518 			/* Attempt to acquire the mnt_devvp and set it up */
1519 			struct vnode *mp_devvp = NULL;
1520 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1521 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1522 				    0, &mp_devvp, vfs_context_kernel());
1523 				if (!lerr) {
1524 					mp->mnt_devvp = mp_devvp;
1525 					//vnode_lookup took an iocount, need to drop it.
1526 					vnode_put(mp_devvp);
1527 
1528 					// now set `device_vnode` to the devvp that was acquired.
1529 					// note that though the iocount above was dropped, the mount acquires
1530 					// an implicit reference against the device.
1531 					device_vnode = mp_devvp;
1532 				}
1533 			}
1534 		}
1535 #else
1536 		error = EINVAL;
1537 #endif
1538 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1539 #if CONFIG_MOUNT_PREBOOTRECOVERY
1540 		struct mount *origin_mp = (struct mount*)fsmountargs;
1541 		uint32_t mount_role = 0;
1542 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1543 			mount_role = VFS_PREBOOT_ROLE;
1544 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1545 			mount_role = VFS_RECOVERY_ROLE;
1546 		}
1547 
1548 		if (mount_role != 0) {
1549 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1550 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1551 			if (error) {
1552 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1553 			} else {
1554 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1555 				/* Mark volume associated with system volume */
1556 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1557 				/* Attempt to acquire the mnt_devvp and set it up */
1558 				struct vnode *mp_devvp = NULL;
1559 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1560 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1561 					    0, &mp_devvp, vfs_context_kernel());
1562 					if (!lerr) {
1563 						mp->mnt_devvp = mp_devvp;
1564 						//vnode_lookup took an iocount, need to drop it.
1565 						vnode_put(mp_devvp);
1566 
1567 						// now set `device_vnode` to the devvp that was acquired.
1568 						// note that though the iocount above was dropped, the mount acquires
1569 						// an implicit reference against the device.
1570 						device_vnode = mp_devvp;
1571 					}
1572 				}
1573 			}
1574 		} else {
1575 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1576 			error = EINVAL;
1577 		}
1578 #else
1579 		error = EINVAL;
1580 #endif
1581 	} else {
1582 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1583 	}
1584 
1585 	if (flags & MNT_UPDATE) {
1586 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1587 			mp->mnt_flag &= ~MNT_RDONLY;
1588 		}
1589 		mp->mnt_flag &= ~
1590 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1591 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1592 		if (error) {
1593 			mp->mnt_flag = flag;  /* restore flag value */
1594 		}
1595 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1596 		lck_rw_done(&mp->mnt_rwlock);
1597 		is_rwlock_locked = FALSE;
1598 		if (!error) {
1599 			enablequotas(mp, ctx);
1600 		}
1601 		goto exit;
1602 	}
1603 
1604 	/*
1605 	 * Put the new filesystem on the mount list after root.
1606 	 */
1607 	if (error == 0) {
1608 		struct vfs_attr vfsattr;
1609 		if (device_vnode) {
1610 			/*
1611 			 *   cache the IO attributes for the underlying physical media...
1612 			 *   an error return indicates the underlying driver doesn't
1613 			 *   support all the queries necessary... however, reasonable
1614 			 *   defaults will have been set, so no reason to bail or care
1615 			 *
1616 			 *   Need to do this before calling the MAC hook as it needs
1617 			 *   information from this call.
1618 			 */
1619 			vfs_init_io_attributes(device_vnode, mp);
1620 		}
1621 
1622 #if CONFIG_MACF
1623 		error = mac_mount_check_mount_late(ctx, mp);
1624 		if (error != 0) {
1625 			goto out4;
1626 		}
1627 
1628 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1629 			error = VFS_ROOT(mp, &rvp, ctx);
1630 			if (error) {
1631 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1632 				goto out4;
1633 			}
1634 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1635 			/*
1636 			 * drop reference provided by VFS_ROOT
1637 			 */
1638 			vnode_put(rvp);
1639 
1640 			if (error) {
1641 				goto out4;
1642 			}
1643 		}
1644 #endif  /* MAC */
1645 
1646 		vnode_lock_spin(vp);
1647 		CLR(vp->v_flag, VMOUNT);
1648 		vp->v_mountedhere = mp;
1649 		SET(vp->v_flag, VMOUNTEDHERE);
1650 		vnode_unlock(vp);
1651 
1652 		/*
1653 		 * taking the name_cache_lock exclusively will
1654 		 * insure that everyone is out of the fast path who
1655 		 * might be trying to use a now stale copy of
1656 		 * vp->v_mountedhere->mnt_realrootvp
1657 		 * bumping mount_generation causes the cached values
1658 		 * to be invalidated
1659 		 */
1660 		name_cache_lock();
1661 		mount_generation++;
1662 		name_cache_unlock();
1663 
1664 		error = vnode_ref(vp);
1665 		if (error != 0) {
1666 			goto out4;
1667 		}
1668 
1669 		have_usecount = TRUE;
1670 
1671 		error = checkdirs(vp, ctx);
1672 		if (error != 0) {
1673 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1674 			goto out4;
1675 		}
1676 		/*
1677 		 * there is no cleanup code here so I have made it void
1678 		 * we need to revisit this
1679 		 */
1680 		(void)VFS_START(mp, 0, ctx);
1681 
1682 		if (mount_list_add(mp) != 0) {
1683 			/*
1684 			 * The system is shutting down trying to umount
1685 			 * everything, so fail with a plausible errno.
1686 			 */
1687 			error = EBUSY;
1688 			goto out4;
1689 		}
1690 		lck_rw_done(&mp->mnt_rwlock);
1691 		is_rwlock_locked = FALSE;
1692 
1693 		/* Check if this mounted file system supports EAs or named streams. */
1694 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1695 		VFSATTR_INIT(&vfsattr);
1696 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1697 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1698 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1699 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1700 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1701 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1702 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1703 			}
1704 #if NAMEDSTREAMS
1705 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1706 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1707 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1708 			}
1709 #endif
1710 			/* Check if this file system supports path from id lookups. */
1711 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1712 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1713 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1714 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1715 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1716 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1717 			}
1718 
1719 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1720 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1721 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1722 			}
1723 		}
1724 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1725 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1726 		}
1727 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1728 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1729 		}
1730 		/* increment the operations count */
1731 		OSAddAtomic(1, &vfs_nummntops);
1732 		enablequotas(mp, ctx);
1733 
1734 		if (device_vnode) {
1735 			vfs_setmountedon(device_vnode);
1736 		}
1737 
1738 		/* Now that mount is setup, notify the listeners */
1739 		vfs_notify_mount(pvp);
1740 		IOBSDMountChange(mp, kIOMountChangeMount);
1741 	} else {
1742 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1743 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1744 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1745 			    mp->mnt_vtable->vfc_name, error);
1746 		}
1747 
1748 		vnode_lock_spin(vp);
1749 		CLR(vp->v_flag, VMOUNT);
1750 		vnode_unlock(vp);
1751 		mount_list_lock();
1752 		mp->mnt_vtable->vfc_refcount--;
1753 		mount_list_unlock();
1754 
1755 		if (device_vnode) {
1756 			vnode_rele(device_vnode);
1757 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1758 			vfs_clearmounting(device_vnode);
1759 		}
1760 		lck_rw_done(&mp->mnt_rwlock);
1761 		is_rwlock_locked = FALSE;
1762 
1763 		if (nc_smr_enabled) {
1764 			vfs_smr_synchronize();
1765 		}
1766 
1767 		/*
1768 		 * if we get here, we have a mount structure that needs to be freed,
1769 		 * but since the coveredvp hasn't yet been updated to point at it,
1770 		 * no need to worry about other threads holding a crossref on this mp
1771 		 * so it's ok to just free it
1772 		 */
1773 		mount_lock_destroy(mp);
1774 #if CONFIG_MACF
1775 		mac_mount_label_destroy(mp);
1776 #endif
1777 		zfree(mount_zone, mp);
1778 		did_set_lmount = false;
1779 	}
1780 exit:
1781 	/*
1782 	 * drop I/O count on the device vp if there was one
1783 	 */
1784 	if (devpath && devvp) {
1785 		vnode_put(devvp);
1786 	}
1787 
1788 	if (did_set_lmount) {
1789 		mount_lock_spin(mp);
1790 		mp->mnt_lflag &= ~MNT_LMOUNT;
1791 		mount_unlock(mp);
1792 	}
1793 
1794 	return error;
1795 
1796 /* Error condition exits */
1797 out4:
1798 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1799 
1800 	/*
1801 	 * If the mount has been placed on the covered vp,
1802 	 * it may have been discovered by now, so we have
1803 	 * to treat this just like an unmount
1804 	 */
1805 	mount_lock_spin(mp);
1806 	mp->mnt_lflag |= MNT_LDEAD;
1807 	mount_unlock(mp);
1808 
1809 	if (device_vnode != NULLVP) {
1810 		vnode_rele(device_vnode);
1811 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1812 		    ctx);
1813 		vfs_clearmounting(device_vnode);
1814 		did_rele = TRUE;
1815 	}
1816 
1817 	vnode_lock_spin(vp);
1818 
1819 	mp->mnt_crossref++;
1820 	CLR(vp->v_flag, VMOUNTEDHERE);
1821 	vp->v_mountedhere = (mount_t) 0;
1822 
1823 	vnode_unlock(vp);
1824 
1825 	if (have_usecount) {
1826 		vnode_rele(vp);
1827 	}
1828 out3:
1829 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1830 		vnode_rele(devvp);
1831 		vfs_clearmounting(devvp);
1832 	}
1833 out2:
1834 	if (devpath && devvp) {
1835 		vnode_put(devvp);
1836 	}
1837 out1:
1838 	/* Release mnt_rwlock only when it was taken */
1839 	if (is_rwlock_locked == TRUE) {
1840 		if (flag_set) {
1841 			mp->mnt_flag = flag;  /* restore mnt_flag value */
1842 		}
1843 		lck_rw_done(&mp->mnt_rwlock);
1844 	}
1845 
1846 	if (did_set_lmount) {
1847 		mount_lock_spin(mp);
1848 		mp->mnt_lflag &= ~MNT_LMOUNT;
1849 		mount_unlock(mp);
1850 	}
1851 
1852 	if (mntalloc) {
1853 		if (mp->mnt_crossref) {
1854 			mount_dropcrossref(mp, vp, 0);
1855 		} else {
1856 			if (nc_smr_enabled) {
1857 				vfs_smr_synchronize();
1858 			}
1859 
1860 			mount_lock_destroy(mp);
1861 #if CONFIG_MACF
1862 			mac_mount_label_destroy(mp);
1863 #endif
1864 			zfree(mount_zone, mp);
1865 		}
1866 	}
1867 	if (vfsp_ref) {
1868 		mount_list_lock();
1869 		vfsp->vfc_refcount--;
1870 		mount_list_unlock();
1871 	}
1872 
1873 	return error;
1874 }
1875 
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT
 *
 * Prepare covered vnode 'vp' for a mount: authorize the caller, flush
 * and invalidate its cached buffers, verify it is a directory that is
 * not already busy with a mount, and mark it VMOUNT ("mount in
 * progress").
 *
 * internal_flags:
 *   KERNEL_MOUNT_NOAUTH - skip the owner/superuser authorization check.
 *   KERNEL_MOUNT_FMOUNT - fmount(2) semantics; stricter busy check.
 *
 * Returns 0 with VMOUNT set on vp; nonzero errno otherwise (VMOUNT is
 * cleared again if the MAC policy check fails).
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push dirty data to disk, then drop cached blocks for the vnode. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Mounts may only cover directories. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * fmount() treats the vnode as busy if EITHER a mount is in
	 * progress (VMOUNT) or something is already mounted here; the
	 * regular path only refuses when BOTH are true.
	 */
	vnode_lock_spin(vp);
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	/* MAC policy veto: roll back the VMOUNT marking on denial. */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1945 
1946 #if CONFIG_IMGSRC_ACCESS
1947 
1948 #define DEBUG_IMGSRC 0
1949 
1950 #if DEBUG_IMGSRC
1951 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1952 #else
1953 #define IMGSRC_DEBUG(args...) do { } while(0)
1954 #endif
1955 
1956 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1957 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1958 {
1959 	struct nameidata nd;
1960 	vnode_t vp, realdevvp;
1961 	kauth_action_t accessmode;
1962 	int error;
1963 	enum uio_seg uio = UIO_USERSPACE;
1964 
1965 	if (ctx == vfs_context_kernel()) {
1966 		uio = UIO_SYSSPACE;
1967 	}
1968 
1969 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1970 	if ((error = namei(&nd))) {
1971 		IMGSRC_DEBUG("namei() failed with %d\n", error);
1972 		return error;
1973 	}
1974 
1975 	vp = nd.ni_vp;
1976 
1977 	if (!vnode_isblk(vp)) {
1978 		IMGSRC_DEBUG("Not block device.\n");
1979 		error = ENOTBLK;
1980 		goto out;
1981 	}
1982 
1983 	realdevvp = mp->mnt_devvp;
1984 	if (realdevvp == NULLVP) {
1985 		IMGSRC_DEBUG("No device backs the mount.\n");
1986 		error = ENXIO;
1987 		goto out;
1988 	}
1989 
1990 	error = vnode_getwithref(realdevvp);
1991 	if (error != 0) {
1992 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1993 		goto out;
1994 	}
1995 
1996 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1997 		IMGSRC_DEBUG("Wrong dev_t.\n");
1998 		error = ENXIO;
1999 		goto out1;
2000 	}
2001 
2002 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2003 
2004 	/*
2005 	 * If mount by non-root, then verify that user has necessary
2006 	 * permissions on the device.
2007 	 */
2008 	if (!vfs_context_issuser(ctx)) {
2009 		accessmode = KAUTH_VNODE_READ_DATA;
2010 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2011 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2012 		}
2013 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2014 			IMGSRC_DEBUG("Access denied.\n");
2015 			goto out1;
2016 		}
2017 	}
2018 
2019 	*devvpp = vp;
2020 
2021 out1:
2022 	vnode_put(realdevvp);
2023 
2024 out:
2025 	nameidone(&nd);
2026 
2027 	if (error) {
2028 		vnode_put(vp);
2029 	}
2030 
2031 	return error;
2032 }
2033 
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * Attach 'mp' on top of covered vnode 'vp'.  On failure,
 * mnt_vnodecovered is reset to NULLVP, but v_mountedhere/VMOUNTEDHERE
 * are NOT undone here -- callers use undo_place_on_covered_vp() for a
 * full rollback.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the "mount in progress" marker for the real mounted-here state. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the life of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2084 
/*
 * Undo place_mount_and_checkdirs(): drop the usecount it took on the
 * covered vnode and clear the mounted-here state, detaching 'mp'
 * from 'vp'.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2096 
2097 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2098 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2099 {
2100 	int error;
2101 
2102 	/* unmount in progress return error */
2103 	mount_lock_spin(mp);
2104 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2105 		mount_unlock(mp);
2106 		return EBUSY;
2107 	}
2108 	mount_unlock(mp);
2109 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2110 
2111 	/*
2112 	 * We only allow the filesystem to be reloaded if it
2113 	 * is currently mounted read-only.
2114 	 */
2115 	if ((flags & MNT_RELOAD) &&
2116 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2117 		error = ENOTSUP;
2118 		goto out;
2119 	}
2120 
2121 	/*
2122 	 * Only root, or the user that did the original mount is
2123 	 * permitted to update it.
2124 	 */
2125 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2126 	    (!vfs_context_issuser(ctx))) {
2127 		error = EPERM;
2128 		goto out;
2129 	}
2130 #if CONFIG_MACF
2131 	error = mac_mount_check_remount(ctx, mp);
2132 	if (error != 0) {
2133 		goto out;
2134 	}
2135 #endif
2136 
2137 out:
2138 	if (error) {
2139 		lck_rw_done(&mp->mnt_rwlock);
2140 	}
2141 
2142 	return error;
2143 }
2144 
/* Release the exclusive mount rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2150 
2151 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2152 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2153 {
2154 	vnode_t vp;
2155 
2156 	if (height >= MAX_IMAGEBOOT_NESTING) {
2157 		return EINVAL;
2158 	}
2159 
2160 	vp = imgsrc_rootvnodes[height];
2161 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2162 		*rvpp = vp;
2163 		return 0;
2164 	} else {
2165 		return ENOENT;
2166 	}
2167 }
2168 
/*
 * Move the imageboot source filesystem (at nesting level 'height') so
 * that it covers vnode 'vp' instead of its boot-time location, then
 * add it to the mount list.  Arguments are either a bare device path
 * (legacy, height 0) or a mnt_imgsrc_args structure when 'by_index'.
 *
 * Root-only.  Each imgsrc mount may be moved at most once
 * (MNTK_HAS_MOVED).
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Copy in a user32/user64 mnt_imgsrc_args, per caller ABI. */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp, released on every exit path below. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 *
	 * NOTE(review): error is 0 here, so this path returns success
	 * while the pre-lock check above returns EBUSY -- confirm this
	 * asymmetry is intentional.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount now. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Keep the old f_mntonname so out3 can restore it on failure. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Roll back the mount-on name and the moved flag. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2389 
2390 #endif /* CONFIG_IMGSRC_ACCESS */
2391 
/*
 * Turn on disk quotas for an HFS mount: for each quota type whose
 * trigger file (<mnton>/<qfopsname>.<type>) exists, issue Q_QUOTAON
 * against the corresponding quota file.  Errors are deliberately
 * ignored so quota setup never blocks the mount itself.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger file found; hand the actual quota file to the FS. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2425 
2426 
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is 'olddp' (the newly covered vnode), retarget it to
 * 'newdp' (the root of the new mount), moving the usecount reference
 * accordingly.  Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* Second ref failed; give back the first and bail. */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2506 
2507 
2508 
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * Also swaps the global rootvnode if it was the covered vnode.
 * Returns 0 on success or the error from VFS_ROOT().
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount means nobody has it as cwd/root; nothing to do. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the mount now covering olddp (takes an iocount). */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2551 
2552 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2553 	"com.apple.private.vfs.role-account-unmount"
2554 
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 *
 * Resolves uap->path, requires it to be the root of its filesystem,
 * runs the MAC unmount check, then hands off to safedounmount() with
 * a mount ref (which safedounmount consumes).
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Hold the mount across the vnode_put; safedounmount needs it. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2608 
2609 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2610 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2611 {
2612 	mount_t mp;
2613 
2614 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2615 	if (mp == (mount_t)0) {
2616 		return ENOENT;
2617 	}
2618 	mount_ref(mp, 0);
2619 	mount_iterdrop(mp);
2620 	/* safedounmount consumes the mount ref */
2621 	return safedounmount(mp, flags, ctx);
2622 }
2623 
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Policy gate in front of dounmount(): rejects unresponsive
 * filesystems under MNT_NOBLOCK, unauthorized callers, the root
 * filesystem and its system/imageboot companions.  On any rejection
 * the mount ref is dropped here; on success it is passed on to
 * dounmount().
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over the mount ref (withref == 1). */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2691 
/*
 * Do the actual file system unmount.
 *
 * flags:   MNT_* unmount flags (MNT_FORCE, MNT_NOBLOCK, ...).
 * withref: nonzero if the caller passed in a mount ref to consume.
 *
 * Sequence: mark MNT_LUNMOUNT under the mount lock, force-unmount
 * submounts if needed, flush vnodes, call VFS_UNMOUNT, close the
 * backing device, detach from the covered vnode and mount list, then
 * tear the mount structure down (or hand the last reference to the
 * crossref mechanism).  Any failure before VFS_UNMOUNT succeeds rolls
 * the in-progress flags back and returns the error.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep a NOBLOCK unmount from hanging on unresponsive remote FSes. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		/* Non-forced: flush data first; any sync error aborts cleanly. */
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain; undo the in-progress markers. */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		/* crossref keeps mp alive until the covered vnode settles */
		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Tell watchers of the parent directory about the change. */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2982 
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects, under the mount list lock, the fsids of every mount that
 * is (transitively) mounted on top of 'mp', then unmounts them in
 * reverse (deepest-first) order.  Errors are ignored; failed
 * submounts are left dangling.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: can't block for memory while holding mount_list_lock. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump
	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* smp sits on one of ours: record it too. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	/* Index 0 is mp itself, so stop at i > 0. */
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3043 
/*
 * Drop one cross reference on a mount taken across its covered vnode dp.
 * When the last crossref goes away and the mount is no longer the one
 * mounted on dp (i.e. it has been unmounted), this performs the final
 * teardown and frees the mount structure.
 *
 * need_put: if set, also release the caller's iocount on dp (while the
 * vnode lock is still held).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref gone and mp no longer mounted here: destroy it. */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Wait out SMR readers of the name cache before freeing. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3077 
3078 
3079 /*
3080  * Sync each mounted filesystem.
3081  */
3082 #if DIAGNOSTIC
3083 int syncprt = 0;
3084 #endif
3085 
3086 int print_vmpage_stat = 0;
3087 
3088 /*
3089  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3090  *			mounted read-write with the passed waitfor value.
3091  *
3092  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3093  *		arg	user argument (please see below)
3094  *
3095  * User argument is a pointer to 32 bit unsigned integer which describes the
3096  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3097  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3098  * waitfor value.
3099  *
3100  * Returns:		VFS_RETURNED
3101  */
3102 static int
sync_callback(mount_t mp,void * arg)3103 sync_callback(mount_t mp, void *arg)
3104 {
3105 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3106 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3107 		unsigned waitfor = MNT_NOWAIT;
3108 
3109 		if (arg) {
3110 			waitfor = *(uint32_t*)arg;
3111 		}
3112 
3113 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3114 		if (waitfor != MNT_WAIT &&
3115 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3116 		    waitfor != MNT_NOWAIT &&
3117 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3118 		    waitfor != MNT_DWAIT &&
3119 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3120 			panic("Passed inappropriate waitfor %u to "
3121 			    "sync_callback()", waitfor);
3122 		}
3123 
3124 		mp->mnt_flag &= ~MNT_ASYNC;
3125 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3126 		if (asyncflag) {
3127 			mp->mnt_flag |= MNT_ASYNC;
3128 		}
3129 	}
3130 
3131 	return VFS_RETURNED;
3132 }
3133 
3134 /* ARGSUSED */
3135 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3136 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3137 {
3138 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3139 
3140 	if (print_vmpage_stat) {
3141 		vm_countdirtypages();
3142 	}
3143 
3144 #if DIAGNOSTIC
3145 	if (syncprt) {
3146 		vfs_bufstats();
3147 	}
3148 #endif /* DIAGNOSTIC */
3149 	return 0;
3150 }
3151 
/* Selects which class of media sync_internal_callback will flush. */
typedef enum {
	SYNC_ALL = 0,			/* no filtering */
	SYNC_ONLY_RELIABLE_MEDIA = 1,	/* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2	/* virtual or non-local devices only */
} sync_type_t;
3157 
3158 static int
sync_internal_callback(mount_t mp,void * arg)3159 sync_internal_callback(mount_t mp, void *arg)
3160 {
3161 	if (arg) {
3162 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3163 		    (mp->mnt_flag & MNT_LOCAL);
3164 		sync_type_t sync_type = *((sync_type_t *)arg);
3165 
3166 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3167 			return VFS_RETURNED;
3168 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3169 			return VFS_RETURNED;
3170 		}
3171 	}
3172 
3173 	(void)sync_callback(mp, NULL);
3174 
3175 	return VFS_RETURNED;
3176 }
3177 
int sync_thread_state = 0;	/* SYNC_THREAD_* bits, protected by sync_mtx_lck */
int sync_timeout_seconds = 5;	/* max time sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN       0x0001	/* work is pending for the sync thread */
#define SYNC_THREAD_RUNNING   0x0002	/* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;	/* identity of the running sync thread, for write accounting */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3187 
/*
 * Body of the kernel thread spawned by sync_internal().  Loops while work
 * is pending (SYNC_THREAD_RUN), syncing reliable media first and then
 * unreliable media, then signals any waiter and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; drop the lock while syncing. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3231 
/* Last time a sync timeout was logged; rate-limits the message to one per 120s. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3233 
3234 /*
3235  * An in-kernel sync for power management to call.
3236  * This function always returns within sync_timeout seconds.
3237  */
3238 __private_extern__ int
sync_internal(void)3239 sync_internal(void)
3240 {
3241 	thread_t thd = NULL;
3242 	int error;
3243 	int thread_created = FALSE;
3244 	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
3245 
3246 	lck_mtx_lock(&sync_mtx_lck);
3247 	sync_thread_state |= SYNC_THREAD_RUN;
3248 	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
3249 		int kr;
3250 
3251 		sync_thread_state |= SYNC_THREAD_RUNNING;
3252 		kr = kernel_thread_start(sync_thread, NULL, &thd);
3253 		if (kr != KERN_SUCCESS) {
3254 			sync_thread_state &= ~SYNC_THREAD_RUNNING;
3255 			lck_mtx_unlock(&sync_mtx_lck);
3256 			printf("sync_thread failed\n");
3257 			return 0;
3258 		}
3259 		thread_created = TRUE;
3260 	}
3261 
3262 	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
3263 	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
3264 	if (error) {
3265 		struct timeval now;
3266 
3267 		microtime(&now);
3268 		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
3269 			printf("sync timed out: %d sec\n", sync_timeout_seconds);
3270 			sync_timeout_last_print.tv_sec = now.tv_sec;
3271 		}
3272 	}
3273 
3274 	if (thread_created) {
3275 		thread_deallocate(thd);
3276 	}
3277 
3278 	return 0;
3279 } /* end of sync_internal call */
3280 
3281 /*
3282  * Change filesystem quotas.
3283  */
3284 #if QUOTA
3285 int
quotactl(proc_t p,struct quotactl_args * uap,__unused int32_t * retval)3286 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
3287 {
3288 	struct mount *mp;
3289 	int error, quota_cmd, quota_status = 0;
3290 	caddr_t datap;
3291 	size_t fnamelen;
3292 	struct nameidata nd;
3293 	vfs_context_t ctx = vfs_context_current();
3294 	struct dqblk my_dqblk = {};
3295 
3296 	AUDIT_ARG(uid, uap->uid);
3297 	AUDIT_ARG(cmd, uap->cmd);
3298 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3299 	    uap->path, ctx);
3300 	error = namei(&nd);
3301 	if (error) {
3302 		return error;
3303 	}
3304 	mp = nd.ni_vp->v_mount;
3305 	mount_ref(mp, 0);
3306 	vnode_put(nd.ni_vp);
3307 	nameidone(&nd);
3308 
3309 #if CONFIG_MACF
3310 	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
3311 	if (error != 0) {
3312 		goto out;
3313 	}
3314 #endif
3315 
3316 	/* copyin any data we will need for downstream code */
3317 	quota_cmd = uap->cmd >> SUBCMDSHIFT;
3318 
3319 	switch (quota_cmd) {
3320 	case Q_QUOTAON:
3321 		/* uap->arg specifies a file from which to take the quotas */
3322 		fnamelen = MAXPATHLEN;
3323 		datap = zalloc(ZV_NAMEI);
3324 		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
3325 		break;
3326 	case Q_GETQUOTA:
3327 		/* uap->arg is a pointer to a dqblk structure. */
3328 		datap = (caddr_t) &my_dqblk;
3329 		break;
3330 	case Q_SETQUOTA:
3331 	case Q_SETUSE:
3332 		/* uap->arg is a pointer to a dqblk structure. */
3333 		datap = (caddr_t) &my_dqblk;
3334 		if (proc_is64bit(p)) {
3335 			struct user_dqblk       my_dqblk64;
3336 			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
3337 			if (error == 0) {
3338 				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
3339 			}
3340 		} else {
3341 			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
3342 		}
3343 		break;
3344 	case Q_QUOTASTAT:
3345 		/* uap->arg is a pointer to an integer */
3346 		datap = (caddr_t) &quota_status;
3347 		break;
3348 	default:
3349 		datap = NULL;
3350 		break;
3351 	} /* switch */
3352 
3353 	if (error == 0) {
3354 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
3355 	}
3356 
3357 	switch (quota_cmd) {
3358 	case Q_QUOTAON:
3359 		if (datap != NULL) {
3360 			zfree(ZV_NAMEI, datap);
3361 		}
3362 		break;
3363 	case Q_GETQUOTA:
3364 		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
3365 		if (error == 0) {
3366 			if (proc_is64bit(p)) {
3367 				struct user_dqblk       my_dqblk64;
3368 
3369 				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
3370 				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
3371 				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
3372 			} else {
3373 				error = copyout(datap, uap->arg, sizeof(struct dqblk));
3374 			}
3375 		}
3376 		break;
3377 	case Q_QUOTASTAT:
3378 		/* uap->arg is a pointer to an integer */
3379 		if (error == 0) {
3380 			error = copyout(datap, uap->arg, sizeof(quota_status));
3381 		}
3382 		break;
3383 	default:
3384 		break;
3385 	} /* switch */
3386 
3387 out:
3388 	mount_drop(mp, 0);
3389 	return error;
3390 }
3391 #else
3392 int
quotactl(__unused proc_t p,__unused struct quotactl_args * uap,__unused int32_t * retval)3393 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
3394 {
3395 	return EOPNOTSUPP;
3396 }
3397 #endif /* QUOTA */
3398 
3399 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3400 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3401 {
3402 	int error;
3403 	vfs_context_t ctx = vfs_context_current();
3404 
3405 #if CONFIG_MACF
3406 	error = mac_mount_check_stat(ctx, mp);
3407 	if (error != 0) {
3408 		return error;
3409 	}
3410 #endif
3411 
3412 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3413 	if (error != 0) {
3414 		return error;
3415 	}
3416 
3417 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3418 }
3419 
3420 /*
3421  * Get filesystem statistics.
3422  *
3423  * Returns:	0			Success
3424  *	namei:???
3425  *	vfs_update_vfsstat:???
3426  *	munge_statfs:EFAULT
3427  */
3428 /* ARGSUSED */
3429 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3430 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3431 {
3432 	int error;
3433 	struct mount *mp;
3434 	struct nameidata nd;
3435 	vfs_context_t ctx = vfs_context_current();
3436 	vnode_t vp;
3437 
3438 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3439 	    UIO_USERSPACE, uap->path, ctx);
3440 	error = namei(&nd);
3441 	if (error != 0) {
3442 		return error;
3443 	}
3444 	vp = nd.ni_vp;
3445 	mp = vp->v_mount;
3446 	nameidone(&nd);
3447 
3448 	error = statfs_internal(p, mp, uap->buf);
3449 	vnode_put(vp);
3450 
3451 	return error;
3452 }
3453 
3454 /*
3455  * Get filesystem statistics.
3456  */
3457 /* ARGSUSED */
3458 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3459 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3460 {
3461 	int error;
3462 	vnode_t vp = NULL;
3463 	struct mount *mp;
3464 
3465 	AUDIT_ARG(fd, uap->fd);
3466 
3467 	if ((error = file_vnode(uap->fd, &vp)) ||
3468 	    (error = vnode_getwithref(vp))) {
3469 		goto out;
3470 	}
3471 
3472 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3473 
3474 	mp = vp->v_mount;
3475 	if (!mp) {
3476 		error = EBADF;
3477 		goto out_vnode;
3478 	}
3479 
3480 	error = statfs_internal(p, mp, uap->buf);
3481 
3482 out_vnode:
3483 	vnode_put(vp);
3484 
3485 out:
3486 	if (vp != NULL) {
3487 		file_drop(uap->fd);
3488 	}
3489 
3490 	return error;
3491 }
3492 
3493 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3494 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3495 {
3496 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3497 
3498 	bzero(sfs, sizeof(*sfs));
3499 
3500 	sfs->f_bsize = vsfs->f_bsize;
3501 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3502 	sfs->f_blocks = vsfs->f_blocks;
3503 	sfs->f_bfree = vsfs->f_bfree;
3504 	sfs->f_bavail = vsfs->f_bavail;
3505 	sfs->f_files = vsfs->f_files;
3506 	sfs->f_ffree = vsfs->f_ffree;
3507 	sfs->f_fsid = vsfs->f_fsid;
3508 	sfs->f_owner = vsfs->f_owner;
3509 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3510 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3511 	sfs->f_fssubtype = vsfs->f_fssubtype;
3512 	sfs->f_flags_ext = 0;
3513 	if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3514 		sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3515 	}
3516 	if (mp->mnt_kern_flag & MNTK_FSKIT) {
3517 		sfs->f_flags_ext |= MNT_EXT_FSKIT;
3518 	}
3519 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3520 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3521 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3522 }
3523 
3524 /*
3525  * Get file system statistics in 64-bit mode
3526  */
3527 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3528 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3529 {
3530 	struct mount *mp;
3531 	int error;
3532 	struct nameidata *ndp;
3533 	struct statfs64 *sfsp;
3534 	vfs_context_t ctxp = vfs_context_current();
3535 	vnode_t vp;
3536 	struct {
3537 		struct nameidata nd;
3538 		struct statfs64 sfs;
3539 	} *__nameidata_statfs64;
3540 
3541 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3542 	    Z_WAITOK);
3543 	ndp = &__nameidata_statfs64->nd;
3544 
3545 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3546 	    UIO_USERSPACE, uap->path, ctxp);
3547 	error = namei(ndp);
3548 	if (error != 0) {
3549 		goto out;
3550 	}
3551 	vp = ndp->ni_vp;
3552 	mp = vp->v_mount;
3553 	nameidone(ndp);
3554 
3555 #if CONFIG_MACF
3556 	error = mac_mount_check_stat(ctxp, mp);
3557 	if (error != 0) {
3558 		vnode_put(vp);
3559 		goto out;
3560 	}
3561 #endif
3562 
3563 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3564 	if (error != 0) {
3565 		vnode_put(vp);
3566 		goto out;
3567 	}
3568 
3569 	sfsp = &__nameidata_statfs64->sfs;
3570 	vfs_get_statfs64(mp, sfsp);
3571 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3572 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3573 		/* This process does not want to see a seperate data volume mountpoint */
3574 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3575 	}
3576 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3577 	vnode_put(vp);
3578 
3579 out:
3580 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3581 
3582 	return error;
3583 }
3584 
3585 /*
3586  * Get file system statistics in 64-bit mode
3587  */
3588 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3589 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3590 {
3591 	struct vnode *vp;
3592 	struct mount *mp;
3593 	struct statfs64 sfs;
3594 	int error;
3595 
3596 	AUDIT_ARG(fd, uap->fd);
3597 
3598 	if ((error = file_vnode(uap->fd, &vp))) {
3599 		return error;
3600 	}
3601 
3602 	error = vnode_getwithref(vp);
3603 	if (error) {
3604 		file_drop(uap->fd);
3605 		return error;
3606 	}
3607 
3608 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3609 
3610 	mp = vp->v_mount;
3611 	if (!mp) {
3612 		error = EBADF;
3613 		goto out;
3614 	}
3615 
3616 #if CONFIG_MACF
3617 	error = mac_mount_check_stat(vfs_context_current(), mp);
3618 	if (error != 0) {
3619 		goto out;
3620 	}
3621 #endif
3622 
3623 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3624 		goto out;
3625 	}
3626 
3627 	vfs_get_statfs64(mp, &sfs);
3628 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3629 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3630 		/* This process does not want to see a seperate data volume mountpoint */
3631 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3632 	}
3633 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3634 
3635 out:
3636 	file_drop(uap->fd);
3637 	vnode_put(vp);
3638 
3639 	return error;
3640 }
3641 
/* Accumulator threaded through the getfsstat*_callback iterators. */
struct getfsstat_struct {
	user_addr_t     sfsp;		/* user buffer cursor; advanced per entry */
	user_addr_t     *mp;		/* optional array of user MAC-label pointers */
	int             count;		/* number of mounts seen so far */
	int             maxcount;	/* capacity of the user buffer, in entries */
	int             flags;		/* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int             error;		/* first error encountered, if any */
};
3650 
3651 
/*
 * Per-mount callback for getfsstat(): copy one (possibly refreshed)
 * statfs entry — and optionally its MAC label — out to the user buffer.
 * Mounts beyond maxcount are only counted, not copied.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Skip dead mounts and mounts whose refresh failed. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by the size munge_statfs actually wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3705 
3706 /*
3707  * Get statistics on all filesystems.
3708  */
3709 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3710 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3711 {
3712 	struct __mac_getfsstat_args muap;
3713 
3714 	muap.buf = uap->buf;
3715 	muap.bufsize = uap->bufsize;
3716 	muap.mac = USER_ADDR_NULL;
3717 	muap.macsize = 0;
3718 	muap.flags = uap->flags;
3719 
3720 	return __mac_getfsstat(p, &muap, retval);
3721 }
3722 
3723 /*
3724  * __mac_getfsstat: Get MAC-related file system statistics
3725  *
3726  * Parameters:    p                        (ignored)
3727  *                uap                      User argument descriptor (see below)
3728  *                retval                   Count of file system statistics (N stats)
3729  *
3730  * Indirect:      uap->bufsize             Buffer size
3731  *                uap->macsize             MAC info size
3732  *                uap->buf                 Buffer where information will be returned
3733  *                uap->mac                 MAC info
3734  *                uap->flags               File system flags
3735  *
3736  *
3737  * Returns:        0                       Success
3738  *                !0                       Not success
3739  *
3740  */
3741 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3742 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3743 {
3744 	user_addr_t sfsp;
3745 	user_addr_t *mp;
3746 	size_t count, maxcount, bufsize, macsize;
3747 	struct getfsstat_struct fst;
3748 
3749 	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3750 		return EINVAL;
3751 	}
3752 
3753 	bufsize = (size_t) uap->bufsize;
3754 	macsize = (size_t) uap->macsize;
3755 
3756 	if (IS_64BIT_PROCESS(p)) {
3757 		maxcount = bufsize / sizeof(struct user64_statfs);
3758 	} else {
3759 		maxcount = bufsize / sizeof(struct user32_statfs);
3760 	}
3761 	sfsp = uap->buf;
3762 	count = 0;
3763 
3764 	mp = NULL;
3765 
3766 #if CONFIG_MACF
3767 	if (uap->mac != USER_ADDR_NULL) {
3768 		u_int32_t *mp0;
3769 		int error;
3770 		unsigned int i;
3771 
3772 		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3773 		if (count != maxcount) {
3774 			return EINVAL;
3775 		}
3776 
3777 		/* Copy in the array */
3778 		mp0 = kalloc_data(macsize, Z_WAITOK);
3779 		if (mp0 == NULL) {
3780 			return ENOMEM;
3781 		}
3782 
3783 		error = copyin(uap->mac, mp0, macsize);
3784 		if (error) {
3785 			kfree_data(mp0, macsize);
3786 			return error;
3787 		}
3788 
3789 		/* Normalize to an array of user_addr_t */
3790 		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3791 		if (mp == NULL) {
3792 			kfree_data(mp0, macsize);
3793 			return ENOMEM;
3794 		}
3795 
3796 		for (i = 0; i < count; i++) {
3797 			if (IS_64BIT_PROCESS(p)) {
3798 				mp[i] = ((user_addr_t *)mp0)[i];
3799 			} else {
3800 				mp[i] = (user_addr_t)mp0[i];
3801 			}
3802 		}
3803 		kfree_data(mp0, macsize);
3804 	}
3805 #endif
3806 
3807 
3808 	fst.sfsp = sfsp;
3809 	fst.mp = mp;
3810 	fst.flags = uap->flags;
3811 	fst.count = 0;
3812 	fst.error = 0;
3813 	fst.maxcount = (int)maxcount;
3814 
3815 
3816 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3817 
3818 	if (mp) {
3819 		kfree_data(mp, count * sizeof(user_addr_t));
3820 	}
3821 
3822 	if (fst.error) {
3823 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3824 		return fst.error;
3825 	}
3826 
3827 	if (fst.sfsp && fst.count > fst.maxcount) {
3828 		*retval = fst.maxcount;
3829 	} else {
3830 		*retval = fst.count;
3831 	}
3832 	return 0;
3833 }
3834 
/*
 * Per-mount callback for getfsstat64(): copy one (possibly refreshed)
 * statfs64 entry out to the user buffer.  Mounts beyond maxcount are only
 * counted, not copied.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Skip dead mounts and mounts whose refresh failed. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3879 
3880 /*
3881  * Get statistics on all file systems in 64 bit mode.
3882  */
3883 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3884 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3885 {
3886 	user_addr_t sfsp;
3887 	int count, maxcount;
3888 	struct getfsstat_struct fst;
3889 
3890 	maxcount = uap->bufsize / sizeof(struct statfs64);
3891 
3892 	sfsp = uap->buf;
3893 	count = 0;
3894 
3895 	fst.sfsp = sfsp;
3896 	fst.flags = uap->flags;
3897 	fst.count = 0;
3898 	fst.error = 0;
3899 	fst.maxcount = maxcount;
3900 
3901 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3902 
3903 	if (fst.error) {
3904 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3905 		return fst.error;
3906 	}
3907 
3908 	if (fst.sfsp && fst.count > fst.maxcount) {
3909 		*retval = fst.maxcount;
3910 	} else {
3911 		*retval = fst.count;
3912 	}
3913 
3914 	return 0;
3915 }
3916 
3917 /*
3918  * gets the associated vnode with the file descriptor passed.
3919  * as input
3920  *
3921  * INPUT
3922  * ctx - vfs context of caller
3923  * fd - file descriptor for which vnode is required.
3924  * vpp - Pointer to pointer to vnode to be returned.
3925  *
3926  * The vnode is returned with an iocount so any vnode obtained
3927  * by this call needs a vnode_put
3928  *
3929  */
3930 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3931 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3932 {
3933 	int error;
3934 	vnode_t vp;
3935 	struct fileproc *fp;
3936 	proc_t p = vfs_context_proc(ctx);
3937 
3938 	*vpp =  NULLVP;
3939 
3940 	error = fp_getfvp(p, fd, &fp, &vp);
3941 	if (error) {
3942 		return error;
3943 	}
3944 
3945 	error = vnode_getwithref(vp);
3946 	if (error) {
3947 		(void)fp_drop(p, fd, fp, 0);
3948 		return error;
3949 	}
3950 
3951 	(void)fp_drop(p, fd, fp, 0);
3952 	*vpp = vp;
3953 	return error;
3954 }
3955 
3956 /*
3957  * Wrapper function around namei to start lookup from a directory
3958  * specified by a file descriptor ni_dirfd.
3959  *
3960  * In addition to all the errors returned by namei, this call can
3961  * return ENOTDIR if the file descriptor does not refer to a directory.
3962  * and EBADF if the file descriptor is not valid.
3963  */
3964 int
nameiat(struct nameidata * ndp,int dirfd)3965 nameiat(struct nameidata *ndp, int dirfd)
3966 {
3967 	if ((dirfd != AT_FDCWD) &&
3968 	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3969 	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
3970 		int error = 0;
3971 		char c;
3972 
3973 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3974 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
3975 			if (error) {
3976 				return error;
3977 			}
3978 		} else {
3979 			c = *((char *)(ndp->ni_dirp));
3980 		}
3981 
3982 		if (c != '/') {
3983 			vnode_t dvp_at;
3984 
3985 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3986 			    &dvp_at);
3987 			if (error) {
3988 				return error;
3989 			}
3990 
3991 			if (vnode_vtype(dvp_at) != VDIR) {
3992 				vnode_put(dvp_at);
3993 				return ENOTDIR;
3994 			}
3995 
3996 			ndp->ni_dvp = dvp_at;
3997 			ndp->ni_cnd.cn_flags |= USEDVP;
3998 			error = namei(ndp);
3999 			ndp->ni_cnd.cn_flags &= ~USEDVP;
4000 			vnode_put(dvp_at);
4001 			return error;
4002 		}
4003 	}
4004 
4005 	return namei(ndp);
4006 }
4007 
4008 /*
4009  * Change current working directory to a given file descriptor.
4010  */
4011 /* ARGSUSED */
4012 static int
common_fchdir(proc_t p,struct fchdir_args * uap,int per_thread)4013 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
4014 {
4015 	vnode_t vp;
4016 	vnode_t tdp;
4017 	vnode_t tvp;
4018 	struct mount *mp;
4019 	int error, should_put = 1;
4020 	vfs_context_t ctx = vfs_context_current();
4021 
4022 	AUDIT_ARG(fd, uap->fd);
4023 	if (per_thread && uap->fd == -1) {
4024 		/*
4025 		 * Switching back from per-thread to per process CWD; verify we
4026 		 * in fact have one before proceeding.  The only success case
4027 		 * for this code path is to return 0 preemptively after zapping
4028 		 * the thread structure contents.
4029 		 */
4030 		thread_t th = vfs_context_thread(ctx);
4031 		if (th) {
4032 			uthread_t uth = get_bsdthread_info(th);
4033 			tvp = uth->uu_cdir;
4034 			uth->uu_cdir = NULLVP;
4035 			if (tvp != NULLVP) {
4036 				vnode_rele(tvp);
4037 				return 0;
4038 			}
4039 		}
4040 		return EBADF;
4041 	}
4042 
4043 	if ((error = file_vnode(uap->fd, &vp))) {
4044 		return error;
4045 	}
4046 	if ((error = vnode_getwithref(vp))) {
4047 		file_drop(uap->fd);
4048 		return error;
4049 	}
4050 
4051 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
4052 
4053 	if (vp->v_type != VDIR) {
4054 		error = ENOTDIR;
4055 		goto out;
4056 	}
4057 
4058 #if CONFIG_MACF
4059 	error = mac_vnode_check_chdir(ctx, vp);
4060 	if (error) {
4061 		goto out;
4062 	}
4063 #endif
4064 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4065 	if (error) {
4066 		goto out;
4067 	}
4068 
4069 	while (!error && (mp = vp->v_mountedhere) != NULL) {
4070 		if (vfs_busy(mp, LK_NOWAIT)) {
4071 			error = EACCES;
4072 			goto out;
4073 		}
4074 		error = VFS_ROOT(mp, &tdp, ctx);
4075 		vfs_unbusy(mp);
4076 		if (error) {
4077 			break;
4078 		}
4079 		vnode_put(vp);
4080 		vp = tdp;
4081 	}
4082 	if (error) {
4083 		goto out;
4084 	}
4085 	if ((error = vnode_ref(vp))) {
4086 		goto out;
4087 	}
4088 	vnode_put(vp);
4089 	should_put = 0;
4090 
4091 	if (per_thread) {
4092 		thread_t th = vfs_context_thread(ctx);
4093 		if (th) {
4094 			uthread_t uth = get_bsdthread_info(th);
4095 			tvp = uth->uu_cdir;
4096 			uth->uu_cdir = vp;
4097 			OSBitOrAtomic(P_THCWD, &p->p_flag);
4098 		} else {
4099 			vnode_rele(vp);
4100 			error = ENOENT;
4101 			goto out;
4102 		}
4103 	} else {
4104 		proc_dirs_lock_exclusive(p);
4105 		proc_fdlock(p);
4106 		tvp = p->p_fd.fd_cdir;
4107 		p->p_fd.fd_cdir = vp;
4108 		proc_fdunlock(p);
4109 		proc_dirs_unlock_exclusive(p);
4110 	}
4111 
4112 	if (tvp) {
4113 		vnode_rele(tvp);
4114 	}
4115 
4116 out:
4117 	if (should_put) {
4118 		vnode_put(vp);
4119 	}
4120 	file_drop(uap->fd);
4121 
4122 	return error;
4123 }
4124 
/* fchdir(2): change the per-process current working directory to fd. */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
4130 
/*
 * __pthread_fchdir: change the per-thread working directory to fd
 * (fd == -1 reverts the thread to the per-process cwd).
 *
 * NOTE(review): the cast relies on struct __pthread_fchdir_args and
 * struct fchdir_args having identical layout — confirm if either changes.
 */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
4136 
4137 
4138 /*
4139  * Change current working directory (".").
4140  *
4141  * Returns:	0			Success
4142  *	change_dir:ENOTDIR
4143  *	change_dir:???
4144  *	vnode_ref:ENOENT		No such file or directory
4145  */
4146 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;		/* previous cwd vnode; its usecount is released at the end */

	/* Resolve the path and verify it is a searchable directory. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/*
	 * Take a long-term usecount (vnode_ref) before dropping the
	 * short-term iocount: the cwd pointer outlives this syscall.
	 */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Per-thread cwd (__pthread_chdir / __pthread_fchdir path). */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having at least one thread-local cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread to attach the cwd to; release the usecount. */
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/*
		 * Process-wide cwd: the dirs lock keeps fd_cdir stable for
		 * concurrent lookups while the fd lock guards the fd table.
		 */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Drop the usecount held on the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4192 
4193 
4194 /*
4195  * Change current working directory (".").
4196  *
4197  * Returns:	0			Success
4198  *	chdir_internal:ENOTDIR
4199  *	chdir_internal:ENOENT		No such file or directory
4200  *	chdir_internal:???
4201  */
4202 /* ARGSUSED */
4203 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4204 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4205 {
4206 	struct nameidata nd;
4207 	vfs_context_t ctx = vfs_context_current();
4208 
4209 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4210 	    UIO_USERSPACE, uap->path, ctx);
4211 
4212 	return chdir_internal(p, ctx, &nd, per_thread);
4213 }
4214 
4215 
4216 /*
4217  * chdir
4218  *
4219  * Change current working directory (".") for the entire process
4220  *
4221  * Parameters:  p       Process requesting the call
4222  *              uap     User argument descriptor (see below)
4223  *              retval  (ignored)
4224  *
4225  * Indirect parameters:	uap->path	Directory path
4226  *
4227  * Returns:	0			Success
4228  *              common_chdir: ENOTDIR
4229  *              common_chdir: ENOENT	No such file or directory
4230  *              common_chdir: ???
4231  *
4232  */
4233 int
chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4234 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4235 {
4236 	return common_chdir(p, (void *)uap, 0);
4237 }
4238 
4239 /*
4240  * __pthread_chdir
4241  *
4242  * Change current working directory (".") for a single thread
4243  *
4244  * Parameters:  p       Process requesting the call
4245  *              uap     User argument descriptor (see below)
4246  *              retval  (ignored)
4247  *
4248  * Indirect parameters:	uap->path	Directory path
4249  *
4250  * Returns:	0			Success
4251  *              common_chdir: ENOTDIR
4252  *		common_chdir: ENOENT	No such file or directory
4253  *		common_chdir: ???
4254  *
4255  */
4256 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4257 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4258 {
4259 	return common_chdir(p, (void *)uap, 1);
4260 }
4261 
4262 
4263 /*
4264  * Change notion of root (``/'') directory.
4265  */
4266 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;		/* previous root vnode; released at the end */
	vfs_context_t ctx = vfs_context_current();

	/* Changing the root directory requires superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* On success we hold an iocount on nd.ni_vp (a verified directory). */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount for a long-term usecount on the new root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount on the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4324 
4325 #define PATHSTATICBUFLEN 256
4326 #define PIVOT_ROOT_ENTITLEMENT              \
4327        "com.apple.private.vfs.pivot-root"
4328 
4329 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack path buffers; long paths fall back to ZV_NAMEI heap buffers. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;		/* path of the filesystem becoming "/" */
	char *outgoing = NULL;		/* path where the old root is re-homed */
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; retry with a MAXPATHLEN buffer if it was too long. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same pattern for the path the old root is moved to. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point incoming/outgoing at whichever buffer actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Single cleanup path: drop the vnode iocount and any heap path buffers. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4421 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is macOS-only; other targets report an unimplemented syscall. */
	return nosys(p, NULL, retval);
}
4427 #endif /* XNU_TARGET_OS_OSX */
4428 
4429 /*
4430  * Common routine for chroot and chdir.
4431  *
4432  * Returns:	0			Success
4433  *		ENOTDIR			Not a directory
4434  *		namei:???		[anything namei can return]
4435  *		vnode_authorize:???	[anything vnode_authorize can return]
4436  */
4437 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4438 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4439 {
4440 	vnode_t vp;
4441 	int error;
4442 
4443 	if ((error = namei(ndp))) {
4444 		return error;
4445 	}
4446 	nameidone(ndp);
4447 	vp = ndp->ni_vp;
4448 
4449 	if (vp->v_type != VDIR) {
4450 		vnode_put(vp);
4451 		return ENOTDIR;
4452 	}
4453 
4454 #if CONFIG_MACF
4455 	error = mac_vnode_check_chdir(ctx, vp);
4456 	if (error) {
4457 		vnode_put(vp);
4458 		return error;
4459 	}
4460 #endif
4461 
4462 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4463 	if (error) {
4464 		vnode_put(vp);
4465 		return error;
4466 	}
4467 
4468 	return error;
4469 }
4470 
4471 /*
4472  * Free the vnode data (for directories) associated with the file glob.
4473  */
4474 struct fd_vn_data *
fg_vn_data_alloc(void)4475 fg_vn_data_alloc(void)
4476 {
4477 	struct fd_vn_data *fvdata;
4478 
4479 	/* Allocate per fd vnode data */
4480 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4481 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4482 	return fvdata;
4483 }
4484 
4485 /*
4486  * Free the vnode data (for directories) associated with the file glob.
4487  */
4488 void
fg_vn_data_free(void * fgvndata)4489 fg_vn_data_free(void *fgvndata)
4490 {
4491 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4492 
4493 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4494 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4495 	kfree_type(struct fd_vn_data, fvdata);
4496 }
4497 
4498 /*
4499  * Check permissions, allocate an open file structure,
4500  * and call the device open routine if any.
4501  *
4502  * Returns:	0			Success
4503  *		EINVAL
4504  *		EINTR
4505  *	falloc:ENFILE
4506  *	falloc:EMFILE
4507  *	falloc:ENOMEM
4508  *	vn_open_auth:???
4509  *	dupfdopen:???
4510  *	VNOP_ADVLOCK:???
4511  *	vnode_setsize:???
4512  *
4513  * XXX Need to implement uid, gid
4514  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;	/* optional vnode backing the authfd authorization */

	oflags = uflags;		/* keep the caller's raw open(2) flags for auditing/leases */

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel f-flags; callers may not pass the
	 * kernel-internal encryption flags directly. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot (indx) and a fileproc up front. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve the authorization fd (if supplied) to a vnode for vn_open_auth. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd set means /dev/fd's fdesc_open
		 * intercepted the open: duplicate that descriptor instead.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;	/* opened vnode; we hold an iocount from vn_open_auth */

	/* Wire the fileproc up as a vnode-backed descriptor. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* O_EXLOCK/O_SHLOCK: take a flock(2)-style advisory lock as part of the open. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;	/* block until the lock is granted */
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so the error path (and close) can undo it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Policy for which file-backed pages may use the secluded pool:
	 * writable mappings never qualify; beyond that it depends on the
	 * secluded_for_filecache mode and which binary/path is being opened.
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/* Drop our iocount; the fileglob continues to reference vp.
	 * NOTE(review): vnode_istty(vp) below runs after this put — presumably
	 * safe because the descriptor keeps the vnode referenced; confirm. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: make slot indx visible and drop our fp ref. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error unwind: undo the advisory lock (if taken), close, and free the slot.
	 * vn_close must run with the fileglob's credential, not the caller's. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4815 
4816 /*
4817  * While most of the *at syscall handlers can call nameiat() which
4818  * is a wrapper around namei, the use of namei and initialisation
4819  * of nameidata are far removed and in different functions  - namei
4820  * gets called in vn_open_auth for open1. So we'll just do here what
4821  * nameiat() does.
4822  */
4823 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4824 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4825     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4826     int dirfd, int authfd)
4827 {
4828 	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4829 		int error;
4830 		char c;
4831 
4832 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4833 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
4834 			if (error) {
4835 				return error;
4836 			}
4837 		} else {
4838 			c = *((char *)(ndp->ni_dirp));
4839 		}
4840 
4841 		if (c != '/') {
4842 			vnode_t dvp_at;
4843 
4844 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4845 			    &dvp_at);
4846 			if (error) {
4847 				return error;
4848 			}
4849 
4850 			if (vnode_vtype(dvp_at) != VDIR) {
4851 				vnode_put(dvp_at);
4852 				return ENOTDIR;
4853 			}
4854 
4855 			ndp->ni_dvp = dvp_at;
4856 			ndp->ni_cnd.cn_flags |= USEDVP;
4857 			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4858 			    retval, authfd);
4859 			vnode_put(dvp_at);
4860 			return error;
4861 		}
4862 	}
4863 
4864 	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
4865 }
4866 
4867 /*
4868  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4869  *
4870  * Parameters:	p			Process requesting the open
4871  *		uap			User argument descriptor (see below)
4872  *		retval			Pointer to an area to receive the
 *					return value from the system call
4874  *
4875  * Indirect:	uap->path		Path to open (same as 'open')
4876  *		uap->flags		Flags to open (same as 'open'
4877  *		uap->uid		UID to set, if creating
4878  *		uap->gid		GID to set, if creating
4879  *		uap->mode		File mode, if creating (same as 'open')
4880  *		uap->xsecurity		ACL to set, if creating
4881  *
4882  * Returns:	0			Success
4883  *		!0			errno value
4884  *
4885  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4886  *
 * XXX:		We should enumerate the possible errno values here, and where
4888  *		in the code they originated.
4889  */
4890 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4891 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4892 {
4893 	int ciferror;
4894 	kauth_filesec_t xsecdst;
4895 	struct vnode_attr va;
4896 	struct nameidata nd;
4897 	int cmode;
4898 
4899 	AUDIT_ARG(owner, uap->uid, uap->gid);
4900 
4901 	xsecdst = NULL;
4902 	if ((uap->xsecurity != USER_ADDR_NULL) &&
4903 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4904 		return ciferror;
4905 	}
4906 
4907 	VATTR_INIT(&va);
4908 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4909 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4910 	if (uap->uid != KAUTH_UID_NONE) {
4911 		VATTR_SET(&va, va_uid, uap->uid);
4912 	}
4913 	if (uap->gid != KAUTH_GID_NONE) {
4914 		VATTR_SET(&va, va_gid, uap->gid);
4915 	}
4916 	if (xsecdst != NULL) {
4917 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4918 		va.va_vaflags |= VA_FILESEC_ACL;
4919 	}
4920 
4921 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4922 	    uap->path, vfs_context_current());
4923 
4924 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4925 	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4926 	if (xsecdst != NULL) {
4927 		kauth_filesec_free(xsecdst);
4928 	}
4929 
4930 	return ciferror;
4931 }
4932 
4933 /*
4934  * Go through the data-protected atomically controlled open (2)
4935  *
4936  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4937  */
4938 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4939 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4940     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4941 {
4942 	/*
4943 	 * Follow the same path as normal open(2)
4944 	 * Look up the item if it exists, and acquire the vnode.
4945 	 */
4946 	struct vnode_attr va;
4947 	struct nameidata nd;
4948 	int cmode;
4949 	int error;
4950 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4951 
4952 	VATTR_INIT(&va);
4953 	/* Mask off all but regular access permissions */
4954 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4955 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4956 
4957 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4958 	    path, ctx);
4959 
4960 	/*
4961 	 * Initialize the extra fields in vnode_attr to pass down our
4962 	 * extra fields.
4963 	 * 1. target cprotect class.
4964 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4965 	 */
4966 	if (flags & O_CREAT) {
4967 		/* lower level kernel code validates that the class is valid before applying it. */
4968 		if (class != PROTECTION_CLASS_DEFAULT) {
4969 			/*
4970 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4971 			 * file behave the same as open (2)
4972 			 */
4973 			VATTR_SET(&va, va_dataprotect_class, class);
4974 		}
4975 	}
4976 
4977 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4978 		if (flags & (O_RDWR | O_WRONLY)) {
4979 			/*
4980 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
4981 			 */
4982 			return EINVAL;
4983 		}
4984 		if (dpflags & O_DP_GETRAWENCRYPTED) {
4985 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4986 		}
4987 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4988 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4989 		}
4990 		if (dpflags & O_DP_AUTHENTICATE) {
4991 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4992 		}
4993 	}
4994 
4995 	error = open1at(vfs_context_current(), &nd, flags, &va,
4996 	    NULL, NULL, retval, fd, authfd);
4997 
4998 	return error;
4999 }
5000 
5001 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5002 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5003 {
5004 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5005 		return EINVAL;
5006 	}
5007 
5008 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5009 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5010 }
5011 
5012 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5013 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5014 {
5015 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5016 		return EINVAL;
5017 	}
5018 
5019 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5020 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5021 }
5022 
5023 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5024 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5025     int fd, enum uio_seg segflg, int *retval)
5026 {
5027 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5028 	struct {
5029 		struct vnode_attr va;
5030 		struct nameidata nd;
5031 	} *__open_data;
5032 	struct vnode_attr *vap;
5033 	struct nameidata *ndp;
5034 	int cmode;
5035 	int error;
5036 
5037 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5038 	vap = &__open_data->va;
5039 	ndp = &__open_data->nd;
5040 
5041 	VATTR_INIT(vap);
5042 	/* Mask off all but regular access permissions */
5043 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5044 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5045 
5046 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5047 	    segflg, path, ctx);
5048 
5049 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5050 
5051 	kfree_type(typeof(*__open_data), __open_data);
5052 
5053 	return error;
5054 }
5055 
5056 int
open(proc_t p,struct open_args * uap,int32_t * retval)5057 open(proc_t p, struct open_args *uap, int32_t *retval)
5058 {
5059 	__pthread_testcancel(1);
5060 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5061 }
5062 
5063 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5064 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5065     int32_t *retval)
5066 {
5067 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5068 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5069 }
5070 
5071 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5072 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5073     int32_t *retval)
5074 {
5075 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5076 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5077 }
5078 
5079 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5080 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5081 {
5082 	__pthread_testcancel(1);
5083 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5084 }
5085 
5086 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5087 
5088 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5089 vfs_context_can_open_by_id(vfs_context_t ctx)
5090 {
5091 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5092 		return TRUE;
5093 	}
5094 
5095 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5096 	           OPEN_BY_ID_ENTITLEMENT);
5097 }
5098 
5099 /*
5100  * openbyid_np: open a file given a file system id and a file system object id
5101  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5102  *	file systems that don't support object ids it is a node id (uint64_t).
5103  *
5104  * Parameters:	p			Process requesting the open
5105  *		uap			User argument descriptor (see below)
5106  *		retval			Pointer to an area to receive the
 *					return value from the system call
5108  *
5109  * Indirect:	uap->path		Path to open (same as 'open')
5110  *
5111  *		uap->fsid		id of target file system
5112  *		uap->objid		id of target file system object
5113  *		uap->flags		Flags to open (same as 'open')
5114  *
5115  * Returns:	0			Success
5116  *		!0			errno value
5117  *
5118  *
5119  * XXX:		We should enummerate the possible errno values here, and where
5120  *		in the code they originated.
5121  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;		/* resolved path, kernel space */
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries or holders of the open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* resolve a path from (fsid, objid), growing the buffer by MAXPATHLEN on ENOSPC */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path before handing it to the opener. */
	buf[pathlen] = 0;

	/* Open by the resolved kernel-space path, starting at the cwd. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5178 
5179 
5180 /*
5181  * Create a special file.
5182  */
5183 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5184     int fd);
5185 
/*
 * Create a special file (character or block device node, or FIFO) at
 * 'upath' relative to 'fd', with the attributes supplied in 'vap'.
 * Common backend for mknod(2) and mknodat(2).
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are valid here (FIFOs handled above). */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease before modifying the parent. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5288 
5289 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5290 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5291 {
5292 	struct vnode_attr va;
5293 
5294 	VATTR_INIT(&va);
5295 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5296 	VATTR_SET(&va, va_rdev, uap->dev);
5297 
5298 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5299 }
5300 
5301 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5302 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5303 {
5304 	struct vnode_attr va;
5305 
5306 	VATTR_INIT(&va);
5307 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5308 	VATTR_SET(&va, va_rdev, uap->dev);
5309 
5310 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5311 }
5312 
5313 /*
5314  * Create a named pipe.
5315  *
5316  * Returns:	0			Success
5317  *		EEXIST
5318  *	namei:???
5319  *	vnode_authorize:???
5320  *	vn_create:???
5321  */
/*
 * Create a FIFO at 'upath' (resolved relative to 'fd') with the attributes
 * in 'vap'.  Common backend for mkfifo(2), mkfifoat(2), mkfifo_extended(2)
 * and mknod(2) with S_IFIFO.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5364 
5365 
5366 /*
5367  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5368  *
5369  * Parameters:	p			Process requesting the open
5370  *		uap			User argument descriptor (see below)
5371  *		retval			(Ignored)
5372  *
5373  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5374  *		uap->uid		UID to set
5375  *		uap->gid		GID to set
5376  *		uap->mode		File mode to set (same as 'mkfifo')
5377  *		uap->xsecurity		ACL to set, if creating
5378  *
5379  * Returns:	0			Success
5380  *		!0			errno value
5381  *
5382  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5383  *
 * XXX:		We should enumerate the possible errno values here, and where
5385  *		in the code they originated.
5386  */
5387 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5388 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5389 {
5390 	int ciferror;
5391 	kauth_filesec_t xsecdst;
5392 	struct vnode_attr va;
5393 
5394 	AUDIT_ARG(owner, uap->uid, uap->gid);
5395 
5396 	xsecdst = KAUTH_FILESEC_NONE;
5397 	if (uap->xsecurity != USER_ADDR_NULL) {
5398 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5399 			return ciferror;
5400 		}
5401 	}
5402 
5403 	VATTR_INIT(&va);
5404 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5405 	if (uap->uid != KAUTH_UID_NONE) {
5406 		VATTR_SET(&va, va_uid, uap->uid);
5407 	}
5408 	if (uap->gid != KAUTH_GID_NONE) {
5409 		VATTR_SET(&va, va_gid, uap->gid);
5410 	}
5411 	if (xsecdst != KAUTH_FILESEC_NONE) {
5412 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5413 		va.va_vaflags |= VA_FILESEC_ACL;
5414 	}
5415 
5416 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5417 
5418 	if (xsecdst != KAUTH_FILESEC_NONE) {
5419 		kauth_filesec_free(xsecdst);
5420 	}
5421 	return ciferror;
5422 }
5423 
5424 /* ARGSUSED */
5425 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5426 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5427 {
5428 	struct vnode_attr va;
5429 
5430 	VATTR_INIT(&va);
5431 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5432 
5433 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5434 }
5435 
5436 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5437 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5438 {
5439 	struct vnode_attr va;
5440 
5441 	VATTR_INIT(&va);
5442 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5443 
5444 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5445 }
5446 
5447 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5448 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5449 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5450 
/*
 * Build the full path of 'dvp' (optionally with 'leafname' appended) into
 * 'path', a buffer of '_len' bytes.  When 'firmlink' is non-zero the
 * firmlink-following variant of vn_getpath() is used.  On truncation or
 * lookup failure, *truncated_path is set and a best-effort ancestor path
 * (or the mount point, or "/") is placed in 'path' instead.  Returns the
 * length of the string stored in 'path', including the terminating NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf name. */
			path[len - 1] = '/';
			/* strlcpy returns strlen(leafname); +1 re-counts the NUL. */
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path filled (nearly) the whole buffer; no room for a leaf — flag as truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the parent chain until some ancestor's path fits. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5518 
/*
 * Firmlink-following flavor of safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           follow_firmlinks);
}
5524 
/*
 * Non-firmlink-following flavor of safe_getpath_new().
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           follow_firmlinks);
}
5530 
5531 /*
5532  * Make a hard file link.
5533  *
5534  * Returns:	0			Success
5535  *		EPERM
5536  *		EEXIST
5537  *		EXDEV
5538  *	namei:???
5539  *	vnode_authorize:???
5540  *	VNOP_LINK:???
5541  */
5542 /* ARGSUSED */
/*
 * Make a hard link from 'path' (relative to fd1) to 'link' (relative to
 * fd2).  Common backend for link(2) and linkat(2).
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node; reuse 'nd' re-initialized for a CREATE lookup */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease before modifying the parent. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Path strings are only assembled when someone will consume them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5756 
5757 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5758 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5759 {
5760 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5761 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5762 }
5763 
5764 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5765 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5766 {
5767 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5768 		return EINVAL;
5769 	}
5770 
5771 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5772 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5773 }
5774 
5775 /*
5776  * Make a symbolic link.
5777  *
5778  * We could add support for ACLs here too...
5779  */
5780 /* ARGSUSED */
/*
 * Make a symbolic link; common backend for symlink(2) and symlinkat(2).
 * 'path_data' is the link's target string; 'link' is the path of the
 * symlink to create, resolved relative to 'fd'.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy in the target string if it lives in user space. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease before modifying the parent. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copied-in target string only if we allocated it above. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5944 
5945 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5946 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5947 {
5948 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5949 	           uap->link, UIO_USERSPACE);
5950 }
5951 
5952 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5953 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5954     __unused int32_t *retval)
5955 {
5956 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5957 	           uap->path2, UIO_USERSPACE);
5958 }
5959 
5960 /*
5961  * Delete a whiteout from the filesystem.
5962  * No longer supported.
5963  */
5964 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5965 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5966 {
5967 	return ENOTSUP;
5968 }
5969 
5970 /*
5971  * Delete a name from the filesystem.
5972  */
5973 /* ARGSUSED */
5974 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5975 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5976     user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5977 {
5978 	struct {
5979 		struct nameidata nd;
5980 #if CONFIG_FSE
5981 		struct vnode_attr va;
5982 		fse_info finfo;
5983 #endif
5984 	} *__unlink_data;
5985 	struct nameidata *ndp;
5986 	vnode_t vp, dvp;
5987 	int error;
5988 	struct componentname *cnp;
5989 	char  *path = NULL;
5990 	char  *no_firmlink_path = NULL;
5991 	int  len_path = 0;
5992 	int  len_no_firmlink_path = 0;
5993 	int flags;
5994 	int need_event;
5995 	int has_listeners;
5996 	int truncated_path;
5997 	int truncated_no_firmlink_path;
5998 	int batched;
5999 	struct vnode_attr *vap;
6000 	int do_retry;
6001 	int retry_count = 0;
6002 	int cn_flags;
6003 	int nofollow_any = 0;
6004 
6005 	cn_flags = LOCKPARENT;
6006 	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
6007 		cn_flags |= AUDITVNPATH1;
6008 	}
6009 	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
6010 		nofollow_any = NAMEI_NOFOLLOW_ANY;
6011 		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
6012 	}
6013 	/* If a starting dvp is passed, it trumps any fd passed. */
6014 	if (start_dvp) {
6015 		cn_flags |= USEDVP;
6016 	}
6017 
6018 #if NAMEDRSRCFORK
6019 	/* unlink or delete is allowed on rsrc forks and named streams */
6020 	cn_flags |= CN_ALLOWRSRCFORK;
6021 #endif
6022 
6023 	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
6024 	ndp = &__unlink_data->nd;
6025 #if CONFIG_FSE
6026 	fse_info *finfop = &__unlink_data->finfo;
6027 #endif
6028 
6029 retry:
6030 	do_retry = 0;
6031 	flags = 0;
6032 	need_event = 0;
6033 	has_listeners = 0;
6034 	truncated_path = 0;
6035 	truncated_no_firmlink_path = 0;
6036 	vap = NULL;
6037 
6038 	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
6039 
6040 	ndp->ni_dvp = start_dvp;
6041 	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
6042 	cnp = &ndp->ni_cnd;
6043 
6044 continue_lookup:
6045 	error = nameiat(ndp, fd);
6046 	if (error) {
6047 		goto early_out;
6048 	}
6049 
6050 	dvp = ndp->ni_dvp;
6051 	vp = ndp->ni_vp;
6052 
6053 	/* With Carbon delete semantics, busy files cannot be deleted */
6054 	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
6055 		flags |= VNODE_REMOVE_NODELETEBUSY;
6056 	}
6057 
6058 	/* Skip any potential upcalls if told to. */
6059 	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
6060 		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
6061 	}
6062 
6063 	if (vp) {
6064 		batched = vnode_compound_remove_available(vp);
6065 		/*
6066 		 * The root of a mounted filesystem cannot be deleted.
6067 		 */
6068 		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
6069 			error = EBUSY;
6070 			goto out;
6071 		}
6072 
6073 #if DEVELOPMENT || DEBUG
6074 		/*
6075 		 * XXX VSWAP: Check for entitlements or special flag here
6076 		 * so we can restrict access appropriately.
6077 		 */
6078 #else /* DEVELOPMENT || DEBUG */
6079 
6080 		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
6081 			error = EPERM;
6082 			goto out;
6083 		}
6084 #endif /* DEVELOPMENT || DEBUG */
6085 
6086 		if (!batched) {
6087 			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
6088 			if (error) {
6089 				if (error == ENOENT) {
6090 					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6091 						do_retry = 1;
6092 						retry_count++;
6093 					}
6094 				}
6095 				goto out;
6096 			}
6097 		}
6098 	} else {
6099 		batched = 1;
6100 
6101 		if (!vnode_compound_remove_available(dvp)) {
6102 			panic("No vp, but no compound remove?");
6103 		}
6104 	}
6105 
6106 #if CONFIG_FSE
6107 	need_event = need_fsevent(FSE_DELETE, dvp);
6108 	if (need_event) {
6109 		if (!batched) {
6110 			if ((vp->v_flag & VISHARDLINK) == 0) {
6111 				/* XXX need to get these data in batched VNOP */
6112 				get_fse_info(vp, finfop, ctx);
6113 			}
6114 		} else {
6115 			error =
6116 			    vfs_get_notify_attributes(&__unlink_data->va);
6117 			if (error) {
6118 				goto out;
6119 			}
6120 
6121 			vap = &__unlink_data->va;
6122 		}
6123 	}
6124 #endif
6125 	has_listeners = kauth_authorize_fileop_has_listeners();
6126 	if (need_event || has_listeners) {
6127 		if (path == NULL) {
6128 			GET_PATH(path);
6129 		}
6130 		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
6131 		if (no_firmlink_path == NULL) {
6132 			GET_PATH(no_firmlink_path);
6133 		}
6134 		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
6135 	}
6136 
6137 #if NAMEDRSRCFORK
6138 	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
6139 		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
6140 	} else
6141 #endif
6142 	{
6143 #if CONFIG_FILE_LEASES
6144 		vnode_breakdirlease(dvp, false, O_WRONLY);
6145 #endif
6146 
6147 		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
6148 		vp = ndp->ni_vp;
6149 		if (error == EKEEPLOOKING) {
6150 			if (!batched) {
6151 				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
6152 			}
6153 
6154 			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6155 				panic("EKEEPLOOKING, but continue flag not set?");
6156 			}
6157 
6158 			if (vnode_isdir(vp)) {
6159 				error = EISDIR;
6160 				goto out;
6161 			}
6162 			goto continue_lookup;
6163 		} else if (error == ENOENT && batched) {
6164 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6165 				/*
6166 				 * For compound VNOPs, the authorization callback may
6167 				 * return ENOENT in case of racing hardlink lookups
6168 				 * hitting the name  cache, redrive the lookup.
6169 				 */
6170 				do_retry = 1;
6171 				retry_count += 1;
6172 				goto out;
6173 			}
6174 		}
6175 	}
6176 
6177 	/*
6178 	 * Call out to allow 3rd party notification of delete.
6179 	 * Ignore result of kauth_authorize_fileop call.
6180 	 */
6181 	if (!error) {
6182 		if (has_listeners) {
6183 			kauth_authorize_fileop(vfs_context_ucred(ctx),
6184 			    KAUTH_FILEOP_DELETE,
6185 			    (uintptr_t)vp,
6186 			    (uintptr_t)path);
6187 		}
6188 
6189 		if (vp->v_flag & VISHARDLINK) {
6190 			//
6191 			// if a hardlink gets deleted we want to blow away the
6192 			// v_parent link because the path that got us to this
6193 			// instance of the link is no longer valid.  this will
6194 			// force the next call to get the path to ask the file
6195 			// system instead of just following the v_parent link.
6196 			//
6197 			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
6198 		}
6199 
6200 #if CONFIG_FSE
6201 		if (need_event) {
6202 			if (vp->v_flag & VISHARDLINK) {
6203 				get_fse_info(vp, finfop, ctx);
6204 			} else if (vap) {
6205 				vnode_get_fse_info_from_vap(vp, finfop, vap);
6206 			}
6207 			if (truncated_path) {
6208 				finfop->mode |= FSE_TRUNCATED_PATH;
6209 			}
6210 			add_fsevent(FSE_DELETE, ctx,
6211 			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
6212 			    FSE_ARG_FINFO, finfop,
6213 			    FSE_ARG_DONE);
6214 		}
6215 #endif
6216 
6217 #if CONFIG_MACF
6218 		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
6219 #endif
6220 	}
6221 
6222 out:
6223 	if (path != NULL) {
6224 		RELEASE_PATH(path);
6225 		path = NULL;
6226 	}
6227 
6228 	if (no_firmlink_path != NULL) {
6229 		RELEASE_PATH(no_firmlink_path);
6230 		no_firmlink_path = NULL;
6231 	}
6232 #if NAMEDRSRCFORK
6233 	/* recycle the deleted rsrc fork vnode to force a reclaim, which
6234 	 * will cause its shadow file to go away if necessary.
6235 	 */
6236 	if (vp && (vnode_isnamedstream(vp)) &&
6237 	    (vp->v_parent != NULLVP) &&
6238 	    vnode_isshadow(vp)) {
6239 		vnode_recycle(vp);
6240 	}
6241 #endif
6242 	/*
6243 	 * nameidone has to happen before we vnode_put(dvp)
6244 	 * since it may need to release the fs_nodelock on the dvp
6245 	 */
6246 	nameidone(ndp);
6247 	vnode_put(dvp);
6248 	if (vp) {
6249 		vnode_put(vp);
6250 	}
6251 
6252 	if (do_retry) {
6253 		goto retry;
6254 	}
6255 
6256 early_out:
6257 	kfree_type(typeof(*__unlink_data), __unlink_data);
6258 	return error;
6259 }
6260 
6261 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6262 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6263     enum uio_seg segflg, int unlink_flags)
6264 {
6265 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6266 	           unlink_flags);
6267 }
6268 
6269 /*
6270  * Delete a name from the filesystem using Carbon semantics.
6271  */
6272 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6273 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6274 {
6275 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6276 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6277 }
6278 
6279 /*
6280  * Delete a name from the filesystem using POSIX semantics.
6281  */
6282 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6283 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6284 {
6285 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6286 	           uap->path, UIO_USERSPACE, 0);
6287 }
6288 
6289 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6290 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6291 {
6292 	int unlink_flags = 0;
6293 
6294 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6295 		return EINVAL;
6296 	}
6297 
6298 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6299 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6300 	}
6301 
6302 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6303 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6304 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6305 		}
6306 		return rmdirat_internal(vfs_context_current(), uap->fd,
6307 		           uap->path, UIO_USERSPACE, unlink_flags);
6308 	} else {
6309 		return unlinkat_internal(vfs_context_current(), uap->fd,
6310 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6311 	}
6312 }
6313 
6314 /*
6315  * Reposition read/write file offset.
6316  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/*
	 * Resolve the descriptor to a vnode.  fp_getfvp reports ENOTSUP
	 * for non-vnode descriptors (sockets, pipes, ...); per POSIX those
	 * are not seekable, so translate to ESPIPE.
	 */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	/* Seeking on a FIFO is never permitted. */
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) only reads the current offset, so it gets
	 * the weaker "get offset" MAC check; any other form may change the
	 * offset and is subjected to the "change offset" check.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the requested absolute offset from 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Delegated to the filesystem; 'offset' is updated in place. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6408 
6409 
6410 /*
6411  * Check access permissions.
6412  *
6413  * Returns:	0			Success
6414  *		vnode_authorize:???
6415  */
6416 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6417 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6418 {
6419 	kauth_action_t action;
6420 	int error;
6421 
6422 	/*
6423 	 * If just the regular access bits, convert them to something
6424 	 * that vnode_authorize will understand.
6425 	 */
6426 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6427 		action = 0;
6428 		if (uflags & R_OK) {
6429 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6430 		}
6431 		if (uflags & W_OK) {
6432 			if (vnode_isdir(vp)) {
6433 				action |= KAUTH_VNODE_ADD_FILE |
6434 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6435 				/* might want delete rights here too */
6436 			} else {
6437 				action |= KAUTH_VNODE_WRITE_DATA;
6438 			}
6439 		}
6440 		if (uflags & X_OK) {
6441 			if (vnode_isdir(vp)) {
6442 				action |= KAUTH_VNODE_SEARCH;
6443 			} else {
6444 				action |= KAUTH_VNODE_EXECUTE;
6445 			}
6446 		}
6447 	} else {
6448 		/* take advantage of definition of uflags */
6449 		action = uflags >> 8;
6450 	}
6451 
6452 #if CONFIG_MACF
6453 	error = mac_vnode_check_access(ctx, vp, uflags);
6454 	if (error) {
6455 		return error;
6456 	}
6457 #endif /* MAC */
6458 
6459 	/* action == 0 means only check for existence */
6460 	if (action != 0) {
6461 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6462 	} else {
6463 		error = 0;
6464 	}
6465 
6466 	return error;
6467 }
6468 
6469 
6470 
6471 /*
6472  * access_extended: Check access permissions in bulk.
6473  *
6474  * Description:	uap->entries		Pointer to an array of accessx
6475  *                                      descriptor structs, plus one or
6476  *                                      more NULL terminated strings (see
6477  *                                      "Notes" section below).
6478  *		uap->size		Size of the area pointed to by
6479  *					uap->entries.
6480  *		uap->results		Pointer to the results array.
6481  *
6482  * Returns:	0			Success
6483  *		ENOMEM			Insufficient memory
6484  *		EINVAL			Invalid arguments
6485  *		namei:EFAULT		Bad address
6486  *		namei:ENAMETOOLONG	Filename too long
6487  *		namei:ENOENT		No such file or directory
6488  *		namei:ELOOP		Too many levels of symbolic links
6489  *		namei:EBADF		Bad file descriptor
6490  *		namei:ENOTDIR		Not a directory
6491  *		namei:???
6492  *		access1:
6493  *
6494  * Implicit returns:
6495  *		uap->results		Array contents modified
6496  *
6497  * Notes:	The uap->entries are structured as an arbitrary length array
6498  *		of accessx descriptors, followed by one or more NULL terminated
6499  *		strings
6500  *
6501  *			struct accessx_descriptor[0]
6502  *			...
6503  *			struct accessx_descriptor[n]
6504  *			char name_data[0];
6505  *
6506  *		We determine the entry count by walking the buffer containing
6507  *		the uap->entries argument descriptor.  For each descriptor we
6508  *		see, the valid values for the offset ad_name_offset will be
6509  *		in the byte range:
6510  *
6511  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6512  *						to
6513  *				[ uap->entries + uap->size - 2 ]
6514  *
6515  *		since we must have at least one string, and the string must
6516  *		be at least one character plus the NULL terminator in length.
6517  *
6518  * XXX:		Need to support the check-as uid argument
6519  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL until we take the real-identity credential below */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served from the stack buffer to avoid kalloc */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file failures are recorded in
		 * the result array; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	/* common cleanup: buffers, lingering vnode iocounts, credential */
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6761 
6762 
6763 /*
6764  * Returns:	0			Success
6765  *		namei:EFAULT		Bad address
6766  *		namei:ENAMETOOLONG	Filename too long
6767  *		namei:ENOENT		No such file or directory
6768  *		namei:ELOOP		Too many levels of symbolic links
6769  *		namei:EBADF		Bad file descriptor
6770  *		namei:ENOTDIR		Not a directory
6771  *		namei:???
6772  *		access1:
6773  */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* symlink-follow behavior is selected by the AT_* flags */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* the actual permission check against the (possibly real) identity */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* drop the iocounts taken by the lookup */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* release the real-identity credential taken above, if any */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6855 
6856 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6857 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6858 {
6859 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6860 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6861 }
6862 
6863 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6864 faccessat(__unused proc_t p, struct faccessat_args *uap,
6865     __unused int32_t *retval)
6866 {
6867 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6868 		return EINVAL;
6869 	}
6870 
6871 	return faccessat_internal(vfs_context_current(), uap->fd,
6872 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6873 }
6874 
6875 /*
6876  * Returns:	0			Success
6877  *		EFAULT
6878  *	copyout:EFAULT
6879  *	namei:???
6880  *	vn_stat:???
6881  */
6882 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6883 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6884     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6885     enum uio_seg segflg, int fd, int flag)
6886 {
6887 	struct nameidata *ndp = NULL;
6888 	int follow;
6889 	union {
6890 		struct stat sb;
6891 		struct stat64 sb64;
6892 	} source = {};
6893 	union {
6894 		struct user64_stat user64_sb;
6895 		struct user32_stat user32_sb;
6896 		struct user64_stat64 user64_sb64;
6897 		struct user32_stat64 user32_sb64;
6898 	} dest = {};
6899 	caddr_t sbp;
6900 	int error, my_size;
6901 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6902 	size_t xsecurity_bufsize;
6903 	void * statptr;
6904 	struct fileproc *fp = NULL;
6905 	int needsrealdev = 0;
6906 
6907 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6908 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
6909 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6910 	    segflg, path, ctx);
6911 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6912 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6913 	}
6914 
6915 #if NAMEDRSRCFORK
6916 	int is_namedstream = 0;
6917 	/* stat calls are allowed for resource forks. */
6918 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6919 #endif
6920 
6921 	if (flag & AT_FDONLY) {
6922 		vnode_t fvp;
6923 
6924 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6925 		if (error) {
6926 			goto out;
6927 		}
6928 		if ((error = vnode_getwithref(fvp))) {
6929 			file_drop(fd);
6930 			goto out;
6931 		}
6932 		ndp->ni_vp = fvp;
6933 	} else {
6934 		error = nameiat(ndp, fd);
6935 		if (error) {
6936 			goto out;
6937 		}
6938 	}
6939 
6940 	statptr = (void *)&source;
6941 
6942 #if NAMEDRSRCFORK
6943 	/* Grab reference on the shadow stream file vnode to
6944 	 * force an inactive on release which will mark it
6945 	 * for recycle.
6946 	 */
6947 	if (vnode_isnamedstream(ndp->ni_vp) &&
6948 	    (ndp->ni_vp->v_parent != NULLVP) &&
6949 	    vnode_isshadow(ndp->ni_vp)) {
6950 		is_namedstream = 1;
6951 		vnode_ref(ndp->ni_vp);
6952 	}
6953 #endif
6954 
6955 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6956 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6957 		/*
6958 		 * If the caller has the file open, and is not
6959 		 * requesting extended security information, we are
6960 		 * going to let them get the basic stat information.
6961 		 */
6962 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6963 		    fp->fp_glob->fg_cred);
6964 	} else {
6965 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6966 		    isstat64, needsrealdev, ctx);
6967 	}
6968 
6969 #if NAMEDRSRCFORK
6970 	if (is_namedstream) {
6971 		vnode_rele(ndp->ni_vp);
6972 	}
6973 #endif
6974 	vnode_put(ndp->ni_vp);
6975 	nameidone(ndp);
6976 
6977 	if (fp) {
6978 		file_drop(fd);
6979 		fp = NULL;
6980 	}
6981 
6982 	if (error) {
6983 		goto out;
6984 	}
6985 	/* Zap spare fields */
6986 	if (isstat64 != 0) {
6987 		source.sb64.st_lspare = 0;
6988 		source.sb64.st_qspare[0] = 0LL;
6989 		source.sb64.st_qspare[1] = 0LL;
6990 		if (vfs_context_is64bit(ctx)) {
6991 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6992 			my_size = sizeof(dest.user64_sb64);
6993 			sbp = (caddr_t)&dest.user64_sb64;
6994 		} else {
6995 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6996 			my_size = sizeof(dest.user32_sb64);
6997 			sbp = (caddr_t)&dest.user32_sb64;
6998 		}
6999 		/*
7000 		 * Check if we raced (post lookup) against the last unlink of a file.
7001 		 */
7002 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7003 			source.sb64.st_nlink = 1;
7004 		}
7005 	} else {
7006 		source.sb.st_lspare = 0;
7007 		source.sb.st_qspare[0] = 0LL;
7008 		source.sb.st_qspare[1] = 0LL;
7009 		if (vfs_context_is64bit(ctx)) {
7010 			munge_user64_stat(&source.sb, &dest.user64_sb);
7011 			my_size = sizeof(dest.user64_sb);
7012 			sbp = (caddr_t)&dest.user64_sb;
7013 		} else {
7014 			munge_user32_stat(&source.sb, &dest.user32_sb);
7015 			my_size = sizeof(dest.user32_sb);
7016 			sbp = (caddr_t)&dest.user32_sb;
7017 		}
7018 
7019 		/*
7020 		 * Check if we raced (post lookup) against the last unlink of a file.
7021 		 */
7022 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7023 			source.sb.st_nlink = 1;
7024 		}
7025 	}
7026 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7027 		goto out;
7028 	}
7029 
7030 	/* caller wants extended security information? */
7031 	if (xsecurity != USER_ADDR_NULL) {
7032 		/* did we get any? */
7033 		if (fsec == KAUTH_FILESEC_NONE) {
7034 			if (susize(xsecurity_size, 0) != 0) {
7035 				error = EFAULT;
7036 				goto out;
7037 			}
7038 		} else {
7039 			/* find the user buffer size */
7040 			xsecurity_bufsize = fusize(xsecurity_size);
7041 
7042 			/* copy out the actual data size */
7043 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7044 				error = EFAULT;
7045 				goto out;
7046 			}
7047 
7048 			/* if the caller supplied enough room, copy out to it */
7049 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7050 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7051 			}
7052 		}
7053 	}
7054 out:
7055 	if (ndp) {
7056 		kfree_type(struct nameidata, ndp);
7057 	}
7058 	if (fsec != KAUTH_FILESEC_NONE) {
7059 		kauth_filesec_free(fsec);
7060 	}
7061 	return error;
7062 }
7063 
7064 /*
7065  * stat_extended: Get file status; with extended security (ACL).
7066  *
7067  * Parameters:    p                       (ignored)
7068  *                uap                     User argument descriptor (see below)
7069  *                retval                  (ignored)
7070  *
7071  * Indirect:      uap->path               Path of file to get status from
7072  *                uap->ub                 User buffer (holds file status info)
7073  *                uap->xsecurity          ACL to get (extended security)
7074  *                uap->xsecurity_size     Size of ACL
7075  *
7076  * Returns:        0                      Success
7077  *                !0                      errno value
7078  *
7079  */
7080 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7081 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7082     __unused int32_t *retval)
7083 {
7084 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7085 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7086 	           0);
7087 }
7088 
7089 /*
7090  * Returns:	0			Success
7091  *	fstatat_internal:???		[see fstatat_internal() in this file]
7092  */
7093 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7094 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7095 {
7096 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7097 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7098 }
7099 
7100 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7101 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7102 {
7103 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7104 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7105 }
7106 
7107 /*
7108  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7109  *
7110  * Parameters:    p                       (ignored)
7111  *                uap                     User argument descriptor (see below)
7112  *                retval                  (ignored)
7113  *
7114  * Indirect:      uap->path               Path of file to get status from
7115  *                uap->ub                 User buffer (holds file status info)
7116  *                uap->xsecurity          ACL to get (extended security)
7117  *                uap->xsecurity_size     Size of ACL
7118  *
7119  * Returns:        0                      Success
7120  *                !0                      errno value
7121  *
7122  */
7123 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7124 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7125 {
7126 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7127 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7128 	           0);
7129 }
7130 
7131 /*
7132  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7133  *
7134  * Parameters:    p                       (ignored)
7135  *                uap                     User argument descriptor (see below)
7136  *                retval                  (ignored)
7137  *
7138  * Indirect:      uap->path               Path of file to get status from
7139  *                uap->ub                 User buffer (holds file status info)
7140  *                uap->xsecurity          ACL to get (extended security)
7141  *                uap->xsecurity_size     Size of ACL
7142  *
7143  * Returns:        0                      Success
7144  *                !0                      errno value
7145  *
7146  */
7147 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7148 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7149 {
7150 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7151 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7152 	           AT_SYMLINK_NOFOLLOW);
7153 }
7154 
7155 /*
7156  * Get file status; this version does not follow links.
7157  */
7158 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7159 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7160 {
7161 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7162 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7163 }
7164 
7165 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7166 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7167 {
7168 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7169 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7170 }
7171 
7172 /*
7173  * lstat64_extended: Get file status; can handle large inode numbers; does not
7174  * follow links; with extended security (ACL).
7175  *
7176  * Parameters:    p                       (ignored)
7177  *                uap                     User argument descriptor (see below)
7178  *                retval                  (ignored)
7179  *
7180  * Indirect:      uap->path               Path of file to get status from
7181  *                uap->ub                 User buffer (holds file status info)
7182  *                uap->xsecurity          ACL to get (extended security)
7183  *                uap->xsecurity_size     Size of ACL
7184  *
7185  * Returns:        0                      Success
7186  *                !0                      errno value
7187  *
7188  */
7189 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7190 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7191 {
7192 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7193 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7194 	           AT_SYMLINK_NOFOLLOW);
7195 }
7196 
7197 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7198 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7199 {
7200 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7201 		return EINVAL;
7202 	}
7203 
7204 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7205 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7206 }
7207 
7208 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7209 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7210     __unused int32_t *retval)
7211 {
7212 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7213 		return EINVAL;
7214 	}
7215 
7216 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7217 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7218 }
7219 
7220 /*
7221  * Get configurable pathname variables.
7222  *
7223  * Returns:	0			Success
7224  *	namei:???
7225  *	vn_pathconf:???
7226  *
7227  * Notes:	Global implementation  constants are intended to be
7228  *		implemented in this function directly; all other constants
7229  *		are per-FS implementation, and therefore must be handled in
7230  *		each respective FS, instead.
7231  *
7232  * XXX We implement some things globally right now that should actually be
7233  * XXX per-FS; we will need to deal with this at some point.
7234  */
7235 /* ARGSUSED */
7236 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7237 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7238 {
7239 	int error;
7240 	struct nameidata nd;
7241 	vfs_context_t ctx = vfs_context_current();
7242 
7243 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7244 	    UIO_USERSPACE, uap->path, ctx);
7245 	error = namei(&nd);
7246 	if (error) {
7247 		return error;
7248 	}
7249 
7250 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7251 
7252 	vnode_put(nd.ni_vp);
7253 	nameidone(&nd);
7254 	return error;
7255 }
7256 
7257 /*
7258  * Return target name of a symbolic link.
7259  */
7260 /* ARGSUSED */
7261 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7262 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7263     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7264     int *retval)
7265 {
7266 	vnode_t vp;
7267 	uio_t auio;
7268 	int error;
7269 	struct nameidata nd;
7270 	UIO_STACKBUF(uio_buf, 1);
7271 	bool put_vnode;
7272 
7273 	if (bufsize > INT32_MAX) {
7274 		return EINVAL;
7275 	}
7276 
7277 	if (lnk_vp) {
7278 		vp = lnk_vp;
7279 		put_vnode = false;
7280 	} else {
7281 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7282 		    seg, path, ctx);
7283 
7284 		error = nameiat(&nd, fd);
7285 		if (error) {
7286 			return error;
7287 		}
7288 		vp = nd.ni_vp;
7289 		put_vnode = true;
7290 		nameidone(&nd);
7291 	}
7292 
7293 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7294 	    &uio_buf[0], sizeof(uio_buf));
7295 	uio_addiov(auio, buf, bufsize);
7296 	if (vp->v_type != VLNK) {
7297 		error = EINVAL;
7298 	} else {
7299 #if CONFIG_MACF
7300 		error = mac_vnode_check_readlink(ctx, vp);
7301 #endif
7302 		if (error == 0) {
7303 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7304 			    ctx);
7305 		}
7306 		if (error == 0) {
7307 			error = VNOP_READLINK(vp, auio, ctx);
7308 		}
7309 	}
7310 
7311 	if (put_vnode) {
7312 		vnode_put(vp);
7313 	}
7314 
7315 	*retval = (int)(bufsize - uio_resid(auio));
7316 	return error;
7317 }
7318 
7319 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7320 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7321 {
7322 	enum uio_seg procseg;
7323 	vnode_t vp;
7324 	int error;
7325 
7326 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7327 
7328 	AUDIT_ARG(fd, uap->fd);
7329 
7330 	if ((error = file_vnode(uap->fd, &vp))) {
7331 		return error;
7332 	}
7333 	if ((error = vnode_getwithref(vp))) {
7334 		file_drop(uap->fd);
7335 		return error;
7336 	}
7337 
7338 	error = readlinkat_internal(vfs_context_current(), -1,
7339 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7340 	    uap->bufsize, procseg, retval);
7341 
7342 	vnode_put(vp);
7343 	file_drop(uap->fd);
7344 	return error;
7345 }
7346 
7347 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7348 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7349 {
7350 	enum uio_seg procseg;
7351 
7352 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7353 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7354 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7355 	           uap->count, procseg, retval);
7356 }
7357 
7358 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7359 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7360 {
7361 	enum uio_seg procseg;
7362 
7363 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7364 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7365 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7366 	           retval);
7367 }
7368 
7369 /*
7370  * Change file flags, the deep inner layer.
7371  */
7372 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7373 chflags0(vnode_t vp, struct vnode_attr *va,
7374     int (*setattr)(vnode_t, void *, vfs_context_t),
7375     void *arg, vfs_context_t ctx)
7376 {
7377 	kauth_action_t action = 0;
7378 	int error;
7379 
7380 #if CONFIG_MACF
7381 	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7382 	if (error) {
7383 		goto out;
7384 	}
7385 #endif
7386 
7387 	/* request authorisation, disregard immutability */
7388 	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7389 		goto out;
7390 	}
7391 	/*
7392 	 * Request that the auth layer disregard those file flags it's allowed to when
7393 	 * authorizing this operation; we need to do this in order to be able to
7394 	 * clear immutable flags.
7395 	 */
7396 	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7397 		goto out;
7398 	}
7399 	error = (*setattr)(vp, arg, ctx);
7400 
7401 #if CONFIG_MACF
7402 	if (error == 0) {
7403 		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7404 	}
7405 #endif
7406 
7407 out:
7408 	return error;
7409 }
7410 
7411 /*
7412  * Change file flags.
7413  *
7414  * NOTE: this will vnode_put() `vp'
7415  */
7416 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7417 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7418 {
7419 	struct vnode_attr va;
7420 	int error;
7421 
7422 	VATTR_INIT(&va);
7423 	VATTR_SET(&va, va_flags, flags);
7424 
7425 	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7426 	vnode_put(vp);
7427 
7428 	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7429 		error = ENOTSUP;
7430 	}
7431 
7432 	return error;
7433 }
7434 
7435 /*
7436  * Change flags of a file given a path name.
7437  */
7438 /* ARGSUSED */
7439 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7440 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7441 {
7442 	vnode_t vp;
7443 	vfs_context_t ctx = vfs_context_current();
7444 	int error;
7445 	struct nameidata nd;
7446 	uint32_t wantparent = 0;
7447 
7448 #if CONFIG_FILE_LEASES
7449 	wantparent = WANTPARENT;
7450 #endif
7451 
7452 	AUDIT_ARG(fflags, uap->flags);
7453 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7454 	    UIO_USERSPACE, uap->path, ctx);
7455 	error = namei(&nd);
7456 	if (error) {
7457 		return error;
7458 	}
7459 	vp = nd.ni_vp;
7460 
7461 #if CONFIG_FILE_LEASES
7462 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7463 	vnode_put(nd.ni_dvp);
7464 #endif
7465 
7466 	nameidone(&nd);
7467 
7468 	/* we don't vnode_put() here because chflags1 does internally */
7469 	error = chflags1(vp, uap->flags, ctx);
7470 
7471 	return error;
7472 }
7473 
7474 /*
7475  * Change flags of a file given a file descriptor.
7476  */
7477 /* ARGSUSED */
7478 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7479 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7480 {
7481 	vnode_t vp;
7482 	int error;
7483 
7484 	AUDIT_ARG(fd, uap->fd);
7485 	AUDIT_ARG(fflags, uap->flags);
7486 	if ((error = file_vnode(uap->fd, &vp))) {
7487 		return error;
7488 	}
7489 
7490 	if ((error = vnode_getwithref(vp))) {
7491 		file_drop(uap->fd);
7492 		return error;
7493 	}
7494 
7495 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7496 
7497 #if CONFIG_FILE_LEASES
7498 	vnode_breakdirlease(vp, true, O_WRONLY);
7499 #endif
7500 
7501 	/* we don't vnode_put() here because chflags1 does internally */
7502 	error = chflags1(vp, uap->flags, vfs_context_current());
7503 
7504 	file_drop(uap->fd);
7505 	return error;
7506 }
7507 
7508 /*
7509  * Change security information on a filesystem object.
7510  *
7511  * Returns:	0			Success
7512  *		EPERM			Operation not permitted
7513  *		vnode_authattr:???	[anything vnode_authattr can return]
7514  *		vnode_authorize:???	[anything vnode_authorize can return]
7515  *		vnode_setattr:???	[anything vnode_setattr can return]
7516  *
7517  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7518  *		translated to EPERM before being returned.
7519  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC hooks: mode, ownership and ACL changes each have their own check. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Auth-layer EACCES becomes EPERM (see function header comment). */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Notify MAC modules only of the attributes that were actually set. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7587 
7588 
7589 /*
7590  * Change mode of a file given a path name.
7591  *
7592  * Returns:	0			Success
7593  *		namei:???		[anything namei can return]
7594  *		chmod_vnode:???		[anything chmod_vnode can return]
7595  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also resolve the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/* Either "no follow" flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse symlinks in ANY path component, not just the last. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break the parent's lease and drop its iocount before the change. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7628 
7629 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7630 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7631     gid_t gid, user_addr_t xsecurity)
7632 {
7633 	int error;
7634 
7635 	VATTR_INIT(pva);
7636 
7637 	if (mode != -1) {
7638 		VATTR_SET(pva, va_mode, mode & ALLPERMS);
7639 	} else {
7640 		pva->va_mode = 0;
7641 	}
7642 
7643 	if (uid != KAUTH_UID_NONE) {
7644 		VATTR_SET(pva, va_uid, uid);
7645 	}
7646 
7647 	if (gid != KAUTH_GID_NONE) {
7648 		VATTR_SET(pva, va_gid, gid);
7649 	}
7650 
7651 	*pxsecdst = NULL;
7652 	switch (xsecurity) {
7653 	case USER_ADDR_NULL:
7654 		break;
7655 
7656 	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7657 		VATTR_SET(pva, va_acl, NULL);
7658 		break;
7659 
7660 	default:
7661 		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7662 			return error;
7663 		}
7664 
7665 		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7666 		pva->va_vaflags |= VA_FILESEC_ACL;
7667 		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7668 		break;
7669 	}
7670 
7671 	return 0;
7672 }
7673 
7674 /*
7675  * chmod_extended: Change the mode of a file given a path name; with extended
7676  * argument list (including extended security (ACL)).
7677  *
7678  * Parameters:	p			Process requesting the open
7679  *		uap			User argument descriptor (see below)
7680  *		retval			(ignored)
7681  *
7682  * Indirect:	uap->path		Path to object (same as 'chmod')
7683  *		uap->uid		UID to set
7684  *		uap->gid		GID to set
7685  *		uap->mode		File mode to set (same as 'chmod')
7686  *		uap->xsecurity		ACL to set (or delete)
7687  *
7688  * Returns:	0			Success
7689  *		!0			errno value
7690  *
7691  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7692  *
7693  * XXX:		We should enummerate the possible errno values here, and where
7694  *		in the code they originated.
7695  */
7696 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7697 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7698 {
7699 	int error;
7700 	struct vnode_attr va;
7701 	kauth_filesec_t xsecdst = NULL;
7702 
7703 	AUDIT_ARG(owner, uap->uid, uap->gid);
7704 
7705 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7706 	    uap->gid, uap->xsecurity);
7707 
7708 	if (error) {
7709 		return error;
7710 	}
7711 
7712 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7713 	    UIO_USERSPACE);
7714 
7715 	if (xsecdst != NULL) {
7716 		kauth_filesec_free(xsecdst);
7717 	}
7718 	return error;
7719 }
7720 
7721 /*
7722  * Returns:	0			Success
7723  *		chmodat:???		[anything chmodat can return]
7724  */
7725 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7726 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7727     int flag, enum uio_seg segflg)
7728 {
7729 	struct vnode_attr va;
7730 
7731 	VATTR_INIT(&va);
7732 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7733 
7734 	return chmodat(ctx, path, &va, fd, flag, segflg);
7735 }
7736 
7737 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7738 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7739 {
7740 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7741 	           AT_FDCWD, 0, UIO_USERSPACE);
7742 }
7743 
7744 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7745 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7746 {
7747 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7748 		return EINVAL;
7749 	}
7750 
7751 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7752 	           uap->fd, uap->flag, UIO_USERSPACE);
7753 }
7754 
7755 /*
7756  * Change mode of a file given a file descriptor.
7757  */
7758 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7759 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7760 {
7761 	vnode_t vp;
7762 	int error;
7763 
7764 	AUDIT_ARG(fd, fd);
7765 
7766 	if ((error = file_vnode(fd, &vp)) != 0) {
7767 		return error;
7768 	}
7769 	if ((error = vnode_getwithref(vp)) != 0) {
7770 		file_drop(fd);
7771 		return error;
7772 	}
7773 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7774 
7775 #if CONFIG_FILE_LEASES
7776 	vnode_breakdirlease(vp, true, O_WRONLY);
7777 #endif
7778 
7779 	error = chmod_vnode(vfs_context_current(), vp, vap);
7780 	(void)vnode_put(vp);
7781 	file_drop(fd);
7782 
7783 	return error;
7784 }
7785 
7786 /*
7787  * fchmod_extended: Change mode of a file given a file descriptor; with
7788  * extended argument list (including extended security (ACL)).
7789  *
7790  * Parameters:    p                       Process requesting to change file mode
7791  *                uap                     User argument descriptor (see below)
7792  *                retval                  (ignored)
7793  *
7794  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7795  *                uap->uid                UID to set
7796  *                uap->gid                GID to set
7797  *                uap->xsecurity          ACL to set (or delete)
7798  *                uap->fd                 File descriptor of file to change mode
7799  *
7800  * Returns:        0                      Success
7801  *                !0                      errno value
7802  *
7803  */
7804 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7805 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7806 {
7807 	int error;
7808 	struct vnode_attr va;
7809 	kauth_filesec_t xsecdst = NULL;
7810 
7811 	AUDIT_ARG(owner, uap->uid, uap->gid);
7812 
7813 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7814 	    uap->gid, uap->xsecurity);
7815 
7816 	if (error) {
7817 		return error;
7818 	}
7819 
7820 	error = fchmod1(p, uap->fd, &va);
7821 
7822 	if (xsecdst != NULL) {
7823 		kauth_filesec_free(xsecdst);
7824 	}
7825 	return error;
7826 }
7827 
7828 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7829 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7830 {
7831 	struct vnode_attr va;
7832 
7833 	VATTR_INIT(&va);
7834 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7835 
7836 	return fchmod1(p, uap->fd, &va);
7837 }
7838 
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/*
	 * Build an attribute list carrying only the requested changes;
	 * VNOVAL for uid or gid means "leave that id alone".
	 */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the containing directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7900 
7901 /*
7902  * Set ownership given a path name.
7903  */
7904 /* ARGSUSED */
7905 static int
fchownat_internal(vfs_context_t ctx,int fd,user_addr_t path,uid_t uid,gid_t gid,int flag,enum uio_seg segflg)7906 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
7907     gid_t gid, int flag, enum uio_seg segflg)
7908 {
7909 	vnode_t vp;
7910 	int error;
7911 	struct nameidata nd;
7912 	int follow;
7913 
7914 	AUDIT_ARG(owner, uid, gid);
7915 
7916 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7917 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
7918 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7919 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7920 	}
7921 
7922 	error = nameiat(&nd, fd);
7923 	if (error) {
7924 		return error;
7925 	}
7926 
7927 	vp = nd.ni_vp;
7928 	error = vn_chown_internal(ctx, vp, uid, gid);
7929 
7930 	nameidone(&nd);
7931 	vnode_put(vp);
7932 	return error;
7933 }
7934 
7935 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7936 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7937 {
7938 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7939 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7940 }
7941 
7942 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7943 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7944 {
7945 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7946 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7947 }
7948 
7949 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7950 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7951 {
7952 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7953 		return EINVAL;
7954 	}
7955 
7956 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7957 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7958 }
7959 
7960 /*
7961  * Set ownership given a file descriptor.
7962  */
7963 /* ARGSUSED */
7964 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)7965 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7966 {
7967 	vfs_context_t ctx = vfs_context_current();
7968 	vnode_t vp;
7969 	int error;
7970 
7971 	AUDIT_ARG(owner, uap->uid, uap->gid);
7972 	AUDIT_ARG(fd, uap->fd);
7973 
7974 	if ((error = file_vnode(uap->fd, &vp))) {
7975 		return error;
7976 	}
7977 
7978 	if ((error = vnode_getwithref(vp))) {
7979 		file_drop(uap->fd);
7980 		return error;
7981 	}
7982 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7983 
7984 	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
7985 
7986 	(void)vnode_put(vp);
7987 	file_drop(uap->fd);
7988 	return error;
7989 }
7990 
7991 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)7992 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7993 {
7994 	int error;
7995 
7996 	if (usrtvp == USER_ADDR_NULL) {
7997 		struct timeval old_tv;
7998 		/* XXX Y2038 bug because of microtime argument */
7999 		microtime(&old_tv);
8000 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8001 		tsp[1] = tsp[0];
8002 	} else {
8003 		if (IS_64BIT_PROCESS(current_proc())) {
8004 			struct user64_timeval tv[2];
8005 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8006 			if (error) {
8007 				return error;
8008 			}
8009 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8010 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8011 		} else {
8012 			struct user32_timeval tv[2];
8013 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8014 			if (error) {
8015 				return error;
8016 			}
8017 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8018 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8019 		}
8020 	}
8021 	return 0;
8022 }
8023 
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] is the new access time, ts[1] the new modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* Caller passed no explicit times ("set to now"). */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Explicit times denied by auth: EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8080 
8081 /*
8082  * Set the access and modification times of a file.
8083  */
8084 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also resolve the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Drop the parent iocount (leases builds only), then the target. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8133 
8134 /*
8135  * Set the access and modification times of a file.
8136  */
8137 /* ARGSUSED */
8138 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8139 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8140 {
8141 	struct timespec ts[2];
8142 	vnode_t vp;
8143 	user_addr_t usrtvp;
8144 	int error;
8145 
8146 	AUDIT_ARG(fd, uap->fd);
8147 	usrtvp = uap->tptr;
8148 	if ((error = getutimes(usrtvp, ts)) != 0) {
8149 		return error;
8150 	}
8151 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
8152 		return error;
8153 	}
8154 	if ((error = vnode_getwithref(vp))) {
8155 		file_drop(uap->fd);
8156 		return error;
8157 	}
8158 
8159 #if CONFIG_FILE_LEASES
8160 	vnode_breakdirlease(vp, true, O_WRONLY);
8161 #endif
8162 
8163 	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8164 
8165 	vnode_put(vp);
8166 	file_drop(uap->fd);
8167 	return error;
8168 }
8169 
8170 static int
truncate_validate_common(proc_t p,off_t length)8171 truncate_validate_common(proc_t p, off_t length)
8172 {
8173 	rlim_t fsize_limit;
8174 
8175 	if (length < 0) {
8176 		return EINVAL;
8177 	}
8178 
8179 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8180 	if ((rlim_t)length > fsize_limit) {
8181 		psignal(p, SIGXFSZ);
8182 		return EFBIG;
8183 	}
8184 
8185 	return 0;
8186 }
8187 
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/* The new size is applied as a va_data_size attribute change. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8238 
8239 /*
8240  * Truncate a file given its path name.
8241  */
8242 /* ARGSUSED */
8243 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8244 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8245 {
8246 	vfs_context_t ctx = vfs_context_current();
8247 	vnode_t vp;
8248 	int error;
8249 	struct nameidata nd;
8250 
8251 	if ((error = truncate_validate_common(p, uap->length))) {
8252 		return error;
8253 	}
8254 
8255 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8256 	    UIO_USERSPACE, uap->path, ctx);
8257 
8258 	if ((error = namei(&nd))) {
8259 		return error;
8260 	}
8261 
8262 	vp = nd.ni_vp;
8263 	nameidone(&nd);
8264 
8265 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8266 	vnode_put(vp);
8267 
8268 	return error;
8269 }
8270 
8271 /*
8272  * Truncate a file given a file descriptor.
8273  */
8274 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length and RLIMIT_FSIZE before touching the descriptor. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* Only POSIX shared memory objects and vnodes can be truncated. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8325 
8326 
/*
 * Sync an open file with synchronized I/O _file_ integrity completion
 */
/* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync() is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_WAIT requests full file-integrity (data + metadata) semantics. */
	return fsync_common(p, uap, MNT_WAIT);
}
8337 
8338 
/*
 * Sync an open file with synchronized I/O _file_ integrity completion
 *
 * Notes:	This is a legacy support function that does not test for
 *		thread cancellation points.
 */
/* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Argument structures are layout-identical, so the cast is safe. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8351 
8352 
/*
 * Sync an open file with synchronized I/O _data_ integrity completion
 */
/* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync() is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_DWAIT requests data-integrity-only (fdatasync) semantics. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8363 
8364 
8365 /*
8366  * fsync_common
8367  *
8368  * Common fsync code to support both synchronized I/O file integrity completion
8369  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8370  *
8371  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8372  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8374  * includes additional metadata unnecessary for retrieving the file data
8375  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8376  * storage.
8377  *
8378  * Parameters:	p				The process
8379  *		uap->fd				The descriptor to synchronize
8380  *		flags				The data integrity flags
8381  *
8382  * Returns:	int				Success
8383  *	fp_getfvp:EBADF				Bad file descriptor
8384  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8385  *	VNOP_FSYNC:???				unspecified
8386  *
8387  * Notes:	We use struct fsync_args because it is a short name, and all
8388  *		caller argument structures are otherwise identical.
8389  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Translate fd -> fileproc + vnode; fails for non-vnode descriptors. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount so the vnode can't be reclaimed during the sync. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/*
	 * Sync resource fork shadow file if necessary: only for named-stream
	 * shadow vnodes that were actually written through this descriptor.
	 * The flush result is intentionally ignored (best effort).
	 */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	/* Release iocount and the fp reference taken above, in that order. */
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8427 
/*
 * Duplicate files.  Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 *     perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; the iocount on fvp is held until out1. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the destination for create.  SAVESTART keeps the start
	 * directory in tond.ni_startdir, which must be released (sdvp) below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target may only be replaced with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Neither endpoint may be a directory. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets can't be copied, except for fdesc pseudo-files. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Caller must be able to read the source... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ...delete any existing target... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ...and add a new entry to the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file onto its own parent directory makes no sense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 *
	 * error == -1 is a private sentinel meaning "success, nothing to
	 * do"; it is mapped back to 0 at the bottom of this function.
	 */
	if (fvp == tvp) {
		error = -1;
	}

#if CONFIG_FILE_LEASES
	/* Creating/replacing an entry in tdvp may break a directory lease. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Map the "same vnode" sentinel to success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8542 
8543 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8544 
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;	/* attributes fetched from the source */
	struct vnode_attr nva;	/* attributes to apply to the clone */
	uint32_t vnop_flags;

	/*
	 * Only regular files, symlinks and directories may be cloned; the
	 * kauth action on the target directory depends on the source type.
	 */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points cannot be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the clone destination; WANTPARENT also returns tdvp. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The clone target must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross file systems. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* May we add an entry of this type to the target directory? */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * Require generic read rights on the source, minus READ_DATA when
	 * the caller (the fclonefileat path) already verified the source
	 * descriptor was open for reading.
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/* A returned ACL is allocated; remember to free it on the way out. */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		/* vn_attribute_prepare() requires the matching cleanup at out. */
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		/* DATAVAULT/RESTRICTED bits come from the target, not source. */
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry in tdvp may require breaking a directory lease. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	/* Common cleanup; tvp may have been created by VNOP_CLONEFILE. */
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8778 
8779 /*
8780  * clone files or directories, target must not exist.
8781  */
8782 /* ARGSUSED */
8783 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8784 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8785     __unused int32_t *retval)
8786 {
8787 	vnode_t fvp;
8788 	struct nameidata fromnd;
8789 	int follow;
8790 	int error;
8791 	vfs_context_t ctx = vfs_context_current();
8792 
8793 	/* Check that the flags are valid. */
8794 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8795 		return EINVAL;
8796 	}
8797 
8798 	AUDIT_ARG(fd, uap->src_dirfd);
8799 
8800 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8801 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8802 	    UIO_USERSPACE, uap->src, ctx);
8803 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8804 		return error;
8805 	}
8806 
8807 	fvp = fromnd.ni_vp;
8808 	nameidone(&fromnd);
8809 
8810 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8811 	    uap->flags, ctx);
8812 
8813 	vnode_put(fvp);
8814 	return error;
8815 }
8816 
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* Translate fd -> fileproc + vnode; released via file_drop() at out. */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Hold an iocount on the source vnode for the duration of the clone. */
	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/*
	 * FREAD was verified above, so tell clonefile_internal that read
	 * access to the source data is already authorised (TRUE).
	 */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8857 
8858 static int
rename_submounts_callback(mount_t mp,void * arg)8859 rename_submounts_callback(mount_t mp, void *arg)
8860 {
8861 	int error = 0;
8862 	mount_t pmp = (mount_t)arg;
8863 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8864 
8865 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8866 		return 0;
8867 	}
8868 
8869 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8870 		return 0;
8871 	}
8872 
8873 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8874 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8875 		return -1;
8876 	}
8877 
8878 	size_t pathlen = MAXPATHLEN;
8879 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8880 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8881 	}
8882 
8883 	vfs_unbusy(mp);
8884 
8885 	return error;
8886 }
8887 
8888 /*
8889  * Rename files.  Source and destination must either both be directories,
8890  * or both not be directories.  If target is a directory, it must be empty.
8891  */
8892 /* ARGSUSED */
8893 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8894 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8895     int tofd, user_addr_t to, int segflg, u_int uflags)
8896 {
8897 	vnode_t tvp, tdvp;
8898 	vnode_t fvp, fdvp;
8899 	vnode_t mnt_fvp;
8900 	struct nameidata *fromnd, *tond;
8901 	int error = 0;
8902 	int do_retry;
8903 	int retry_count;
8904 	int mntrename;
8905 	int need_event;
8906 	int need_kpath2;
8907 	int has_listeners;
8908 	const char *oname = NULL;
8909 	char *from_name = NULL, *to_name = NULL;
8910 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8911 	int from_len = 0, to_len = 0;
8912 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8913 	int holding_mntlock;
8914 	int vn_authorize_skipped;
8915 	mount_t locked_mp = NULL;
8916 	vnode_t oparent = NULLVP;
8917 #if CONFIG_FSE
8918 	fse_info from_finfo = {}, to_finfo;
8919 #endif
8920 	int from_truncated = 0, to_truncated = 0;
8921 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8922 	int batched = 0;
8923 	struct vnode_attr *fvap, *tvap;
8924 	int continuing = 0;
8925 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8926 	int32_t nofollow_any = 0;
8927 	/* carving out a chunk for structs that are too big to be on stack. */
8928 	struct {
8929 		struct nameidata from_node, to_node;
8930 		struct vnode_attr fv_attr, tv_attr;
8931 	} * __rename_data;
8932 
8933 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8934 	fromnd = &__rename_data->from_node;
8935 	tond = &__rename_data->to_node;
8936 
8937 	holding_mntlock = 0;
8938 	do_retry = 0;
8939 	retry_count = 0;
8940 retry:
8941 	fvp = tvp = NULL;
8942 	fdvp = tdvp = NULL;
8943 	fvap = tvap = NULL;
8944 	mnt_fvp = NULLVP;
8945 	mntrename = FALSE;
8946 	vn_authorize_skipped = FALSE;
8947 
8948 	if (uflags & RENAME_NOFOLLOW_ANY) {
8949 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8950 	}
8951 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8952 	    segflg, from, ctx);
8953 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8954 
8955 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8956 	    segflg, to, ctx);
8957 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8958 
8959 continue_lookup:
8960 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8961 		if ((error = nameiat(fromnd, fromfd))) {
8962 			goto out1;
8963 		}
8964 		fdvp = fromnd->ni_dvp;
8965 		fvp  = fromnd->ni_vp;
8966 
8967 		if (fvp && fvp->v_type == VDIR) {
8968 			tond->ni_cnd.cn_flags |= WILLBEDIR;
8969 		}
8970 	}
8971 
8972 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8973 		if ((error = nameiat(tond, tofd))) {
8974 			/*
8975 			 * Translate error code for rename("dir1", "dir2/.").
8976 			 */
8977 			if (error == EISDIR && fvp->v_type == VDIR) {
8978 				error = EINVAL;
8979 			}
8980 			goto out1;
8981 		}
8982 		tdvp = tond->ni_dvp;
8983 		tvp  = tond->ni_vp;
8984 	}
8985 
8986 #if DEVELOPMENT || DEBUG
8987 	/*
8988 	 * XXX VSWAP: Check for entitlements or special flag here
8989 	 * so we can restrict access appropriately.
8990 	 */
8991 #else /* DEVELOPMENT || DEBUG */
8992 
8993 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8994 		error = EPERM;
8995 		goto out1;
8996 	}
8997 
8998 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8999 		error = EPERM;
9000 		goto out1;
9001 	}
9002 #endif /* DEVELOPMENT || DEBUG */
9003 
9004 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9005 		error = ENOENT;
9006 		goto out1;
9007 	}
9008 
9009 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9010 		int32_t pval = 0;
9011 		int err = 0;
9012 
9013 		/*
9014 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9015 		 * has the same name as target iff the following conditions are met:
9016 		 * 1. the target file system is case insensitive
9017 		 * 2. source and target directories are the same
9018 		 * 3. source and target files are the same
9019 		 * 4. name only differs in case (determined by underlying filesystem)
9020 		 */
9021 		if (fvp != tvp || fdvp != tdvp) {
9022 			error = EEXIST;
9023 			goto out1;
9024 		}
9025 
9026 		/*
9027 		 * Assume that the target file system is case sensitive if
9028 		 * _PC_CASE_SENSITIVE selector isn't supported.
9029 		 */
9030 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9031 		if (err != 0 || pval != 0) {
9032 			error = EEXIST;
9033 			goto out1;
9034 		}
9035 	}
9036 
9037 	batched = vnode_compound_rename_available(fdvp);
9038 
9039 #if CONFIG_FSE
9040 	need_event = need_fsevent(FSE_RENAME, fdvp);
9041 	if (need_event) {
9042 		if (fvp) {
9043 			get_fse_info(fvp, &from_finfo, ctx);
9044 		} else {
9045 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9046 			if (error) {
9047 				goto out1;
9048 			}
9049 
9050 			fvap = &__rename_data->fv_attr;
9051 		}
9052 
9053 		if (tvp) {
9054 			get_fse_info(tvp, &to_finfo, ctx);
9055 		} else if (batched) {
9056 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9057 			if (error) {
9058 				goto out1;
9059 			}
9060 
9061 			tvap = &__rename_data->tv_attr;
9062 		}
9063 	}
9064 #else
9065 	need_event = 0;
9066 #endif /* CONFIG_FSE */
9067 
9068 	has_listeners = kauth_authorize_fileop_has_listeners();
9069 
9070 	need_kpath2 = 0;
9071 #if CONFIG_AUDIT
9072 	if (AUDIT_RECORD_EXISTS()) {
9073 		need_kpath2 = 1;
9074 	}
9075 #endif
9076 
9077 	if (need_event || has_listeners) {
9078 		if (from_name == NULL) {
9079 			GET_PATH(from_name);
9080 		}
9081 
9082 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9083 
9084 		if (from_name_no_firmlink == NULL) {
9085 			GET_PATH(from_name_no_firmlink);
9086 		}
9087 
9088 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9089 	}
9090 
9091 	if (need_event || need_kpath2 || has_listeners) {
9092 		if (to_name == NULL) {
9093 			GET_PATH(to_name);
9094 		}
9095 
9096 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9097 
9098 		if (to_name_no_firmlink == NULL) {
9099 			GET_PATH(to_name_no_firmlink);
9100 		}
9101 
9102 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9103 		if (to_name && need_kpath2) {
9104 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9105 		}
9106 	}
9107 	if (!fvp) {
9108 		/*
9109 		 * Claim: this check will never reject a valid rename.
9110 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9111 		 * Suppose fdvp and tdvp are not on the same mount.
9112 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9113 		 *      then you can't move it to within another dir on the same mountpoint.
9114 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9115 		 *
9116 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9117 		 */
9118 		if (fdvp->v_mount != tdvp->v_mount) {
9119 			error = EXDEV;
9120 			goto out1;
9121 		}
9122 		goto skipped_lookup;
9123 	}
9124 
9125 	/*
9126 	 * If the source and destination are the same (i.e. they're
9127 	 * links to the same vnode) and the target file system is
9128 	 * case sensitive, then there is nothing to do.
9129 	 *
9130 	 * XXX Come back to this.
9131 	 */
9132 	if (fvp == tvp) {
9133 		int pathconf_val;
9134 
9135 		/*
9136 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9137 		 * then assume that this file system is case sensitive.
9138 		 */
9139 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9140 		    pathconf_val != 0) {
9141 			vn_authorize_skipped = TRUE;
9142 			goto out1;
9143 		}
9144 	}
9145 
9146 	/*
9147 	 * Allow the renaming of mount points.
9148 	 * - target must not exist
9149 	 * - target must reside in the same directory as source
9150 	 * - union mounts cannot be renamed
9151 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9152 	 *
9153 	 * XXX Handle this in VFS after a continued lookup (if we missed
9154 	 * in the cache to start off)
9155 	 *
9156 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9157 	 * we'll skip past here.  The file system is responsible for
9158 	 * checking that @tvp is not a descendent of @fvp and vice versa
9159 	 * so it should always return EINVAL if either @tvp or @fvp is the
9160 	 * root of a volume.
9161 	 */
9162 	if ((fvp->v_flag & VROOT) &&
9163 	    (fvp->v_type == VDIR) &&
9164 	    (tvp == NULL) &&
9165 	    (fvp->v_mountedhere == NULL) &&
9166 	    (fdvp == tdvp) &&
9167 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9168 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9169 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9170 		vnode_t coveredvp;
9171 
9172 		/* switch fvp to the covered vnode */
9173 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9174 		if ((vnode_getwithref(coveredvp))) {
9175 			error = ENOENT;
9176 			goto out1;
9177 		}
9178 		/*
9179 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9180 		 * later.
9181 		 */
9182 		mnt_fvp = fvp;
9183 
9184 		fvp = coveredvp;
9185 		mntrename = TRUE;
9186 	}
9187 	/*
9188 	 * Check for cross-device rename.
9189 	 */
9190 	if ((fvp->v_mount != tdvp->v_mount) ||
9191 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9192 		error = EXDEV;
9193 		goto out1;
9194 	}
9195 
9196 	/*
9197 	 * If source is the same as the destination (that is the
9198 	 * same inode number) then there is nothing to do...
9199 	 * EXCEPT if the underlying file system supports case
9200 	 * insensitivity and is case preserving.  In this case
9201 	 * the file system needs to handle the special case of
9202 	 * getting the same vnode as target (fvp) and source (tvp).
9203 	 *
9204 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9205 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9206 	 * handle the special case of getting the same vnode as target and
9207 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9208 	 * so not to cause locking problems. There is a single reference on tvp.
9209 	 *
9210 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9211 	 * that correct behaviour then is just to return success without doing
9212 	 * anything.
9213 	 *
9214 	 * XXX filesystem should take care of this itself, perhaps...
9215 	 */
9216 	if (fvp == tvp && fdvp == tdvp) {
9217 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9218 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9219 		    fromnd->ni_cnd.cn_namelen)) {
9220 			vn_authorize_skipped = TRUE;
9221 			goto out1;
9222 		}
9223 	}
9224 
9225 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9226 		/*
9227 		 * we're holding a reference and lock
9228 		 * on locked_mp, but it no longer matches
9229 		 * what we want to do... so drop our hold
9230 		 */
9231 		mount_unlock_renames(locked_mp);
9232 		mount_drop(locked_mp, 0);
9233 		holding_mntlock = 0;
9234 	}
9235 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9236 		/*
9237 		 * serialize renames that re-shape
9238 		 * the tree... if holding_mntlock is
9239 		 * set, then we're ready to go...
9240 		 * otherwise we
9241 		 * first need to drop the iocounts
9242 		 * we picked up, second take the
9243 		 * lock to serialize the access,
9244 		 * then finally start the lookup
9245 		 * process over with the lock held
9246 		 */
9247 		if (!holding_mntlock) {
9248 			/*
9249 			 * need to grab a reference on
9250 			 * the mount point before we
9251 			 * drop all the iocounts... once
9252 			 * the iocounts are gone, the mount
9253 			 * could follow
9254 			 */
9255 			locked_mp = fvp->v_mount;
9256 			mount_ref(locked_mp, 0);
9257 
9258 			/*
9259 			 * nameidone has to happen before we vnode_put(tvp)
9260 			 * since it may need to release the fs_nodelock on the tvp
9261 			 */
9262 			nameidone(tond);
9263 
9264 			if (tvp) {
9265 				vnode_put(tvp);
9266 			}
9267 			vnode_put(tdvp);
9268 
9269 			/*
9270 			 * nameidone has to happen before we vnode_put(fdvp)
9271 			 * since it may need to release the fs_nodelock on the fvp
9272 			 */
9273 			nameidone(fromnd);
9274 
9275 			vnode_put(fvp);
9276 			vnode_put(fdvp);
9277 
9278 			if (mnt_fvp != NULLVP) {
9279 				vnode_put(mnt_fvp);
9280 			}
9281 
9282 			mount_lock_renames(locked_mp);
9283 			holding_mntlock = 1;
9284 
9285 			goto retry;
9286 		}
9287 	} else {
9288 		/*
9289 		 * when we dropped the iocounts to take
9290 		 * the lock, we allowed the identity of
9291 		 * the various vnodes to change... if they did,
9292 		 * we may no longer be dealing with a rename
9293 		 * that reshapes the tree... once we're holding
9294 		 * the iocounts, the vnodes can't change type
9295 		 * so we're free to drop the lock at this point
9296 		 * and continue on
9297 		 */
9298 		if (holding_mntlock) {
9299 			mount_unlock_renames(locked_mp);
9300 			mount_drop(locked_mp, 0);
9301 			holding_mntlock = 0;
9302 		}
9303 	}
9304 
9305 	if (!batched) {
9306 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9307 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9308 		    flags, NULL);
9309 		if (error) {
9310 			if (error == ENOENT) {
9311 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9312 					/*
9313 					 * We encountered a race where after doing the namei,
9314 					 * tvp stops being valid. If so, simply re-drive the rename
9315 					 * call from the top.
9316 					 */
9317 					do_retry = 1;
9318 					retry_count += 1;
9319 				}
9320 			}
9321 			goto out1;
9322 		}
9323 	}
9324 
9325 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9326 	if (mnt_fvp != NULLVP) {
9327 		vnode_put(mnt_fvp);
9328 		mnt_fvp = NULLVP;
9329 	}
9330 
9331 	// save these off so we can later verify that fvp is the same
9332 	oname   = fvp->v_name;
9333 	oparent = fvp->v_parent;
9334 
9335 skipped_lookup:
9336 #if CONFIG_FILE_LEASES
9337 	/* Lease break needed for source's parent dir? */
9338 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9339 
9340 	/* Lease break needed for target's parent dir? */
9341 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9342 #endif
9343 
9344 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9345 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9346 	    flags, ctx);
9347 
9348 	if (holding_mntlock) {
9349 		/*
9350 		 * we can drop our serialization
9351 		 * lock now
9352 		 */
9353 		mount_unlock_renames(locked_mp);
9354 		mount_drop(locked_mp, 0);
9355 		holding_mntlock = 0;
9356 	}
9357 	if (error) {
9358 		if (error == EDATALESS) {
9359 			/*
9360 			 * If we've been here before, something has gone
9361 			 * horribly wrong and we should just get out lest
9362 			 * we spiral around the drain forever.
9363 			 */
9364 			if (flags & VFS_RENAME_DATALESS) {
9365 				error = EIO;
9366 				goto out1;
9367 			}
9368 
9369 			/*
9370 			 * The object we're renaming is dataless (or has a
9371 			 * dataless descendent) and requires materialization
9372 			 * before the rename occurs.  But we're holding the
9373 			 * mount point's rename lock, so it's not safe to
9374 			 * make the upcall.
9375 			 *
9376 			 * In this case, we release the lock (above), perform
9377 			 * the materialization, and start the whole thing over.
9378 			 */
9379 			error = vfs_materialize_reparent(fvp, tdvp);
9380 			if (error == 0) {
9381 				/*
9382 				 * The next time around we need to tell the
9383 				 * file system that the materializtaion has
9384 				 * been performed.
9385 				 */
9386 				flags |= VFS_RENAME_DATALESS;
9387 				do_retry = 1;
9388 			}
9389 			goto out1;
9390 		}
9391 		if (error == EKEEPLOOKING) {
9392 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9393 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9394 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9395 				}
9396 			}
9397 
9398 			fromnd->ni_vp = fvp;
9399 			tond->ni_vp = tvp;
9400 
9401 			goto continue_lookup;
9402 		}
9403 
9404 		/*
9405 		 * We may encounter a race in the VNOP where the destination didn't
9406 		 * exist when we did the namei, but it does by the time we go and
9407 		 * try to create the entry. In this case, we should re-drive this rename
9408 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9409 		 * but other filesystems susceptible to this race could return it, too.
9410 		 */
9411 		if (error == ERECYCLE) {
9412 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9413 				do_retry = 1;
9414 				retry_count += 1;
9415 			} else {
9416 				printf("rename retry limit due to ERECYCLE reached\n");
9417 				error = ENOENT;
9418 			}
9419 		}
9420 
9421 		/*
9422 		 * For compound VNOPs, the authorization callback may return
9423 		 * ENOENT in case of racing hardlink lookups hitting the name
9424 		 * cache, redrive the lookup.
9425 		 */
9426 		if (batched && error == ENOENT) {
9427 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9428 				do_retry = 1;
9429 				retry_count += 1;
9430 			}
9431 		}
9432 
9433 		goto out1;
9434 	}
9435 
9436 	/* call out to allow 3rd party notification of rename.
9437 	 * Ignore result of kauth_authorize_fileop call.
9438 	 */
9439 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9440 	    KAUTH_FILEOP_RENAME,
9441 	    (uintptr_t)from_name, (uintptr_t)to_name);
9442 	if (flags & VFS_RENAME_SWAP) {
9443 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9444 		    KAUTH_FILEOP_RENAME,
9445 		    (uintptr_t)to_name, (uintptr_t)from_name);
9446 	}
9447 
9448 #if CONFIG_FSE
9449 	if (from_name != NULL && to_name != NULL) {
9450 		if (from_truncated || to_truncated) {
9451 			// set it here since only the from_finfo gets reported up to user space
9452 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9453 		}
9454 
9455 		if (tvap && tvp) {
9456 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9457 		}
9458 		if (fvap) {
9459 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9460 		}
9461 
9462 		if (tvp) {
9463 			add_fsevent(FSE_RENAME, ctx,
9464 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9465 			    FSE_ARG_FINFO, &from_finfo,
9466 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9467 			    FSE_ARG_FINFO, &to_finfo,
9468 			    FSE_ARG_DONE);
9469 			if (flags & VFS_RENAME_SWAP) {
9470 				/*
9471 				 * Strictly speaking, swap is the equivalent of
9472 				 * *three* renames.  FSEvents clients should only take
9473 				 * the events as a hint, so we only bother reporting
9474 				 * two.
9475 				 */
9476 				add_fsevent(FSE_RENAME, ctx,
9477 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9478 				    FSE_ARG_FINFO, &to_finfo,
9479 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9480 				    FSE_ARG_FINFO, &from_finfo,
9481 				    FSE_ARG_DONE);
9482 			}
9483 		} else {
9484 			add_fsevent(FSE_RENAME, ctx,
9485 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9486 			    FSE_ARG_FINFO, &from_finfo,
9487 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9488 			    FSE_ARG_DONE);
9489 		}
9490 	}
9491 #endif /* CONFIG_FSE */
9492 
9493 	/*
9494 	 * update filesystem's mount point data
9495 	 */
9496 	if (mntrename) {
9497 		char *cp, *pathend, *mpname;
9498 		char * tobuf;
9499 		struct mount *mp;
9500 		int maxlen;
9501 		size_t len = 0;
9502 
9503 		mp = fvp->v_mountedhere;
9504 
9505 		if (vfs_busy(mp, LK_NOWAIT)) {
9506 			error = EBUSY;
9507 			goto out1;
9508 		}
9509 		tobuf = zalloc(ZV_NAMEI);
9510 
9511 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9512 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9513 		} else {
9514 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9515 		}
9516 		if (!error) {
9517 			/* find current mount point prefix */
9518 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9519 			for (cp = pathend; *cp != '\0'; ++cp) {
9520 				if (*cp == '/') {
9521 					pathend = cp + 1;
9522 				}
9523 			}
9524 			/* find last component of target name */
9525 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9526 				if (*cp == '/') {
9527 					mpname = cp + 1;
9528 				}
9529 			}
9530 
9531 			/* Update f_mntonname of sub mounts */
9532 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9533 
9534 			/* append name to prefix */
9535 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9536 			bzero(pathend, maxlen);
9537 
9538 			strlcpy(pathend, mpname, maxlen);
9539 		}
9540 		zfree(ZV_NAMEI, tobuf);
9541 
9542 		vfs_unbusy(mp);
9543 
9544 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9545 	}
9546 	/*
9547 	 * fix up name & parent pointers.  note that we first
9548 	 * check that fvp has the same name/parent pointers it
9549 	 * had before the rename call... this is a 'weak' check
9550 	 * at best...
9551 	 *
9552 	 * XXX oparent and oname may not be set in the compound vnop case
9553 	 */
9554 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9555 		int update_flags;
9556 
9557 		update_flags = VNODE_UPDATE_NAME;
9558 
9559 		if (fdvp != tdvp) {
9560 			update_flags |= VNODE_UPDATE_PARENT;
9561 		}
9562 
9563 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9564 	}
9565 out1:
9566 	/*
9567 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9568 	 * skipped earlier as no actual rename was performed.
9569 	 */
9570 	if (vn_authorize_skipped && error == 0) {
9571 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9572 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9573 		    flags, NULL);
9574 		if (error && error == ENOENT) {
9575 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9576 				do_retry = 1;
9577 				retry_count += 1;
9578 			}
9579 		}
9580 	}
9581 	if (to_name != NULL) {
9582 		RELEASE_PATH(to_name);
9583 		to_name = NULL;
9584 	}
9585 	if (to_name_no_firmlink != NULL) {
9586 		RELEASE_PATH(to_name_no_firmlink);
9587 		to_name_no_firmlink = NULL;
9588 	}
9589 	if (from_name != NULL) {
9590 		RELEASE_PATH(from_name);
9591 		from_name = NULL;
9592 	}
9593 	if (from_name_no_firmlink != NULL) {
9594 		RELEASE_PATH(from_name_no_firmlink);
9595 		from_name_no_firmlink = NULL;
9596 	}
9597 	if (holding_mntlock) {
9598 		mount_unlock_renames(locked_mp);
9599 		mount_drop(locked_mp, 0);
9600 		holding_mntlock = 0;
9601 	}
9602 	if (tdvp) {
9603 		/*
9604 		 * nameidone has to happen before we vnode_put(tdvp)
9605 		 * since it may need to release the fs_nodelock on the tdvp
9606 		 */
9607 		nameidone(tond);
9608 
9609 		if (tvp) {
9610 			vnode_put(tvp);
9611 		}
9612 		vnode_put(tdvp);
9613 	}
9614 	if (fdvp) {
9615 		/*
9616 		 * nameidone has to happen before we vnode_put(fdvp)
9617 		 * since it may need to release the fs_nodelock on the fdvp
9618 		 */
9619 		nameidone(fromnd);
9620 
9621 		if (fvp) {
9622 			vnode_put(fvp);
9623 		}
9624 		vnode_put(fdvp);
9625 	}
9626 	if (mnt_fvp != NULLVP) {
9627 		vnode_put(mnt_fvp);
9628 	}
9629 	/*
9630 	 * If things changed after we did the namei, then we will re-drive
9631 	 * this rename call from the top.
9632 	 */
9633 	if (do_retry) {
9634 		do_retry = 0;
9635 		goto retry;
9636 	}
9637 
9638 	kfree_type(typeof(*__rename_data), __rename_data);
9639 	return error;
9640 }
9641 
9642 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9643 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9644 {
9645 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9646 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9647 }
9648 
9649 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9650 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9651 {
9652 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9653 		return EINVAL;
9654 	}
9655 
9656 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9657 		return EINVAL;
9658 	}
9659 
9660 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9661 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9662 }
9663 
9664 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9665 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9666 {
9667 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9668 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9669 }
9670 
9671 /*
9672  * Make a directory file.
9673  *
9674  * Returns:	0			Success
9675  *		EEXIST
9676  *	namei:???
9677  *	vnode_authorize:???
9678  *	vn_create:???
9679  */
9680 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;		/* nonzero if the FS supports the compound mkdir VNOP */
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Look up the parent (LOCKPARENT), advertise that the final component
	 * will be a directory (WILLBEDIR), and request a compound-mkdir
	 * lookup so capable file systems can do lookup+mkdir in one VNOP.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target already exists; mkdir must fail with EEXIST. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the CREATE lookup state before re-driving namei. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: report the original denial. */
				goto out;
			} else {
				/* Target exists after all: EEXIST wins over EACCES/EPERM. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any write lease on the parent before mutating it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup from here. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9796 
9797 /*
9798  * mkdir_extended: Create a directory; with extended security (ACL).
9799  *
9800  * Parameters:    p                       Process requesting to create the directory
9801  *                uap                     User argument descriptor (see below)
9802  *                retval                  (ignored)
9803  *
9804  * Indirect:      uap->path               Path of directory to create
9805  *                uap->mode               Access permissions to set
9806  *                uap->xsecurity          ACL to set
9807  *
9808  * Returns:        0                      Success
9809  *                !0                      Not success
9810  *
9811  */
9812 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9813 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9814 {
9815 	int ciferror;
9816 	kauth_filesec_t xsecdst;
9817 	struct vnode_attr va;
9818 
9819 	AUDIT_ARG(owner, uap->uid, uap->gid);
9820 
9821 	xsecdst = NULL;
9822 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9823 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9824 		return ciferror;
9825 	}
9826 
9827 	VATTR_INIT(&va);
9828 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9829 	if (xsecdst != NULL) {
9830 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9831 		va.va_vaflags |= VA_FILESEC_ACL;
9832 	}
9833 
9834 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9835 	    UIO_USERSPACE);
9836 	if (xsecdst != NULL) {
9837 		kauth_filesec_free(xsecdst);
9838 	}
9839 	return ciferror;
9840 }
9841 
9842 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9843 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9844 {
9845 	struct vnode_attr va;
9846 
9847 	VATTR_INIT(&va);
9848 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9849 
9850 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9851 	           UIO_USERSPACE);
9852 }
9853 
9854 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9855 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9856 {
9857 	struct vnode_attr va;
9858 
9859 	VATTR_INIT(&va);
9860 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9861 
9862 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9863 	           UIO_USERSPACE);
9864 }
9865 
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch: nameidata (and fsevent attrs) are too big for the stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;		/* caps the ENOENT re-drive attempts */
	int batched;			/* nonzero if the FS supports compound rmdir */

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the removal flag into the equivalent namei flag. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup invalidated vp; re-drive from the top. */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS deferred lookup to the compound rmdir VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Batched case: let the VNOP fill the attrs during removal. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Capture the path(s) now; they are unavailable after removal. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Break any write lease on the parent before mutating it. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * vp is used below only as a wait channel (an address token for
		 * wakeup_one/tsleep); it is never dereferenced after the put.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10167 
10168 /*
10169  * Remove a directory file.
10170  */
10171 /* ARGSUSED */
10172 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10173 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10174 {
10175 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10176 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10177 }
10178 
/* Get direntry length padded to 8 byte alignment */
/*
 * sizeof(struct direntry) includes the full MAXPATHLEN-sized name buffer;
 * subtracting (MAXPATHLEN-1) and adding back the actual name length yields
 * the space the entry really needs, which is then rounded up to 8 bytes.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/* Get dirent length padded to 4 byte alignment */
/*
 * Same technique as above for the legacy dirent layout, whose embedded name
 * buffer is (__DARWIN_MAXNAMLEN + 1) bytes; rounds up to 4-byte alignment.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent */
/* Address of the last byte claimed by this entry per its d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10190 
10191 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10192 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10193     int *numdirent, vfs_context_t ctxp)
10194 {
10195 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10196 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10197 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10198 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10199 	} else {
10200 		size_t bufsize;
10201 		void * bufptr;
10202 		uio_t auio;
10203 		struct direntry *entry64;
10204 		struct dirent *dep;
10205 		size_t bytesread;
10206 		int error;
10207 
10208 		/*
10209 		 * We're here because the underlying file system does not
10210 		 * support direnties or we mounted denying support so we must
10211 		 * fall back to dirents and convert them to direntries.
10212 		 *
10213 		 * Our kernel buffer needs to be smaller since re-packing will
10214 		 * expand each dirent.  The worse case (when the name length
10215 		 * is 3 or less) corresponds to a struct direntry size of 32
10216 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10217 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10218 		 * will prevent us from reading more than we can pack.
10219 		 *
10220 		 * Since this buffer is wired memory, we will limit the
10221 		 * buffer size to a maximum of 32K. We would really like to
10222 		 * use 32K in the MIN(), but we use magic number 87371 to
10223 		 * prevent uio_resid() * 3 / 8 from overflowing.
10224 		 */
10225 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10226 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10227 		if (bufptr == NULL) {
10228 			return ENOMEM;
10229 		}
10230 
10231 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10232 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10233 		auio->uio_offset = uio->uio_offset;
10234 
10235 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10236 
10237 		dep = (struct dirent *)bufptr;
10238 		bytesread = bufsize - uio_resid(auio);
10239 
10240 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10241 		/*
10242 		 * Convert all the entries and copy them out to user's buffer.
10243 		 */
10244 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10245 			/* First check that the dirent struct up to d_name is within the buffer */
10246 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10247 			    /* Check that the length of the entire dirent is within the buffer */
10248 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10249 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10250 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10251 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10252 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10253 				    vp->v_name ? vp->v_name : "<unknown>");
10254 				error = EIO;
10255 				break;
10256 			}
10257 
10258 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10259 
10260 			bzero(entry64, enbufsize);
10261 			/* Convert a dirent to a dirent64. */
10262 			entry64->d_ino = dep->d_ino;
10263 			entry64->d_seekoff = 0;
10264 			entry64->d_reclen = (uint16_t)enbufsize;
10265 			entry64->d_namlen = dep->d_namlen;
10266 			entry64->d_type = dep->d_type;
10267 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10268 
10269 			/* Move to next entry. */
10270 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10271 
10272 			/* Copy entry64 to user's buffer. */
10273 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10274 		}
10275 
10276 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10277 		if (error == 0) {
10278 			uio->uio_offset = auio->uio_offset;
10279 		}
10280 		uio_free(auio);
10281 		kfree_data(bufptr, bufsize);
10282 		kfree_type(struct direntry, entry64);
10283 		return error;
10284 	}
10285 }
10286 
10287 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10288 
10289 /*
10290  * Read a block of directory entries in a file system independent format.
10291  */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;		/* directory offset at the start of this read */
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still maps
	 * to the vnode we looked up; if it was swapped underneath us (e.g. by
	 * the union-mount traversal below in another thread), retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request; buffer is bounded to GETDIRENTRIES_MAXBUFSIZE. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	/* Extended format goes through the converting wrapper; legacy is direct. */
	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the lower
	 * layer, swap it into the fd, and read from there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		/* Report the offset the read started at, not the new one. */
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10405 
10406 
10407 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10408 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10409 {
10410 	off_t offset;
10411 	ssize_t bytesread;
10412 	int error, eofflag;
10413 
10414 	AUDIT_ARG(fd, uap->fd);
10415 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10416 	    &bytesread, &offset, &eofflag, 0);
10417 
10418 	if (error == 0) {
10419 		if (proc_is64bit(p)) {
10420 			user64_long_t base = (user64_long_t)offset;
10421 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10422 		} else {
10423 			user32_long_t base = (user32_long_t)offset;
10424 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10425 		}
10426 		*retval = (int)bytesread;
10427 	}
10428 	return error;
10429 }
10430 
10431 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10432 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10433 {
10434 	off_t offset;
10435 	ssize_t bytesread;
10436 	int error, eofflag;
10437 	user_size_t bufsize;
10438 
10439 	AUDIT_ARG(fd, uap->fd);
10440 
10441 	/*
10442 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10443 	 * then the kernel carves out the last 4 bytes to return extended
10444 	 * information to userspace (namely whether we reached EOF with this call).
10445 	 */
10446 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10447 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10448 	} else {
10449 		bufsize = uap->bufsize;
10450 	}
10451 
10452 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10453 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10454 
10455 	if (error == 0) {
10456 		*retval = bytesread;
10457 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10458 
10459 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10460 			getdirentries64_flags_t flags = 0;
10461 			if (eofflag) {
10462 				flags |= GETDIRENTRIES64_EOF;
10463 			}
10464 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10465 			    sizeof(flags));
10466 		}
10467 	}
10468 	return error;
10469 }
10470 
10471 
10472 /*
10473  * Set the mode mask for creation of filesystem nodes.
10474  * XXX implement xsecurity
10475  */
10476 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
static int
umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
{
	/*
	 * Install the new file-creation mask (restricted to permission
	 * bits via ALLPERMS) under the fd lock, returning the previous
	 * mask in *retval.  The fsec (ACL) argument is currently unused;
	 * see the "XXX implement xsecurity" note above.
	 */
	AUDIT_ARG(mask, newmask);
	proc_fdlock(p);
	*retval = p->p_fd.fd_cmask;
	p->p_fd.fd_cmask = newmask & ALLPERMS;
	proc_fdunlock(p);
	return 0;
}
10487 
10488 /*
10489  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10490  *
10491  * Parameters:    p                       Process requesting to set the umask
10492  *                uap                     User argument descriptor (see below)
10493  *                retval                  umask of the process (parameter p)
10494  *
10495  * Indirect:      uap->newmask            umask to set
10496  *                uap->xsecurity          ACL to set
10497  *
10498  * Returns:        0                      Success
10499  *                !0                      Not success
10500  *
10501  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * NOTE: the ACL supplied in uap->xsecurity is currently ignored;
	 * KAUTH_FILESEC_NONE is passed down and umask1's fsec parameter
	 * is marked __unused ("XXX implement xsecurity" above).
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10507 
/* umask: set the file-creation mode mask, leaving xsecurity untouched. */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10513 
10514 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10515 	"com.apple.private.vfs.revoke-mounted-device"
10516 
10517 /*
10518  * Void all references to file by ripping underlying filesystem
10519  * away from vnode.
10520  */
10521 /* ARGSUSED */
10522 int
revoke(proc_t p,struct revoke_args * uap,__unused int32_t * retval)10523 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
10524 {
10525 	vnode_t vp;
10526 	struct vnode_attr va;
10527 	vfs_context_t ctx = vfs_context_current();
10528 	int error;
10529 	struct nameidata nd;
10530 
10531 	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
10532 	    uap->path, ctx);
10533 	error = namei(&nd);
10534 	if (error) {
10535 		return error;
10536 	}
10537 	vp = nd.ni_vp;
10538 
10539 	nameidone(&nd);
10540 
10541 	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
10542 		error = ENOTSUP;
10543 		goto out;
10544 	}
10545 
10546 	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
10547 		error = EBUSY;
10548 		goto out;
10549 	}
10550 
10551 #if CONFIG_MACF
10552 	error = mac_vnode_check_revoke(ctx, vp);
10553 	if (error) {
10554 		goto out;
10555 	}
10556 #endif
10557 
10558 	VATTR_INIT(&va);
10559 	VATTR_WANTED(&va, va_uid);
10560 	if ((error = vnode_getattr(vp, &va, ctx))) {
10561 		goto out;
10562 	}
10563 	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
10564 	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
10565 		goto out;
10566 	}
10567 	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
10568 		VNOP_REVOKE(vp, REVOKEALL, ctx);
10569 	}
10570 out:
10571 	vnode_put(vp);
10572 	return error;
10573 }
10574 
10575 
10576 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
10578  *  The following system calls are designed to support features
10579  *  which are specific to the HFS & HFS Plus volume formats
10580  */
10581 
10582 
10583 /*
10584  * Obtain attribute information on objects in a directory while enumerating
10585  * the directory.
10586  */
10587 /* ARGSUSED */
10588 int
getdirentriesattr(proc_t p,struct getdirentriesattr_args * uap,int32_t * retval)10589 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
10590 {
10591 	vnode_t vp;
10592 	struct fileproc *fp;
10593 	uio_t auio = NULL;
10594 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10595 	uint32_t count = 0, savecount = 0;
10596 	uint32_t newstate = 0;
10597 	int error, eofflag = 0;
10598 	off_t loff = 0;
10599 	struct attrlist attributelist;
10600 	vfs_context_t ctx = vfs_context_current();
10601 	int fd = uap->fd;
10602 	UIO_STACKBUF(uio_buf, 1);
10603 	kauth_action_t action;
10604 
10605 	AUDIT_ARG(fd, fd);
10606 
10607 	/* Get the attributes into kernel space */
10608 	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
10609 		return error;
10610 	}
10611 	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
10612 		return error;
10613 	}
10614 	savecount = count;
10615 
10616 get_from_fd:
10617 	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
10618 		return error;
10619 	}
10620 
10621 	vn_offset_lock(fp->fp_glob);
10622 	if (((vnode_t)fp_get_data(fp)) != vp) {
10623 		vn_offset_unlock(fp->fp_glob);
10624 		file_drop(fd);
10625 		goto get_from_fd;
10626 	}
10627 
10628 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10629 		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10630 		error = EBADF;
10631 		goto out;
10632 	}
10633 
10634 
10635 #if CONFIG_MACF
10636 	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
10637 	    fp->fp_glob);
10638 	if (error) {
10639 		goto out;
10640 	}
10641 #endif
10642 
10643 
10644 	if ((error = vnode_getwithref(vp))) {
10645 		goto out;
10646 	}
10647 
10648 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10649 
10650 #if CONFIG_UNION_MOUNTS
10651 unionread:
10652 #endif /* CONFIG_UNION_MOUNTS */
10653 	if (vp->v_type != VDIR) {
10654 		(void)vnode_put(vp);
10655 		error = EINVAL;
10656 		goto out;
10657 	}
10658 
10659 #if CONFIG_MACF
10660 	error = mac_vnode_check_readdir(ctx, vp);
10661 	if (error != 0) {
10662 		(void)vnode_put(vp);
10663 		goto out;
10664 	}
10665 #endif /* MAC */
10666 
10667 	/* set up the uio structure which will contain the users return buffer */
10668 	loff = fp->fp_glob->fg_offset;
10669 	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10670 	uio_addiov(auio, uap->buffer, uap->buffersize);
10671 
10672 	/*
10673 	 * If the only item requested is file names, we can let that past with
10674 	 * just LIST_DIRECTORY.  If they want any other attributes, that means
10675 	 * they need SEARCH as well.
10676 	 */
10677 	action = KAUTH_VNODE_LIST_DIRECTORY;
10678 	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
10679 	    attributelist.fileattr || attributelist.dirattr) {
10680 		action |= KAUTH_VNODE_SEARCH;
10681 	}
10682 
10683 	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
10684 		/* Believe it or not, uap->options only has 32-bits of valid
10685 		 * info, so truncate before extending again */
10686 
10687 		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
10688 		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
10689 	}
10690 
10691 	if (error) {
10692 		(void) vnode_put(vp);
10693 		goto out;
10694 	}
10695 
10696 #if CONFIG_UNION_MOUNTS
10697 	/*
10698 	 * If we've got the last entry of a directory in a union mount
10699 	 * then reset the eofflag and pretend there's still more to come.
10700 	 * The next call will again set eofflag and the buffer will be empty,
10701 	 * so traverse to the underlying directory and do the directory
10702 	 * read there.
10703 	 */
10704 	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
10705 		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
10706 			eofflag = 0;
10707 		} else {                                                // Empty buffer
10708 			vnode_t uvp;
10709 			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
10710 				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
10711 					fp_set_data(fp, uvp);
10712 					fp->fp_glob->fg_offset = 0; // reset index for new dir
10713 					count = savecount;
10714 					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
10715 					vnode_put(vp);
10716 					vp = uvp;
10717 					goto unionread;
10718 				} else {
10719 					/* could not get a ref, can't replace in fd */
10720 					vnode_put(uvp);
10721 				}
10722 			}
10723 		}
10724 	}
10725 #endif /* CONFIG_UNION_MOUNTS */
10726 
10727 	(void)vnode_put(vp);
10728 
10729 	if (error) {
10730 		goto out;
10731 	}
10732 	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
10733 
10734 	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
10735 		goto out;
10736 	}
10737 	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
10738 		goto out;
10739 	}
10740 	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
10741 		goto out;
10742 	}
10743 
10744 	*retval = eofflag;  /* similar to getdirentries */
10745 	error = 0;
10746 out:
10747 	vn_offset_unlock(fp->fp_glob);
10748 	file_drop(fd);
10749 	return error; /* return error earlier, an retval of 0 or 1 now */
10750 } /* end of getdirentriesattr system call */
10751 
10752 /*
10753  * Exchange data between two files
10754  */
10755 
10756 /* ARGSUSED */
10757 int
exchangedata(__unused proc_t p,struct exchangedata_args * uap,__unused int32_t * retval)10758 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
10759 {
10760 	struct nameidata fnd, snd;
10761 	vfs_context_t ctx = vfs_context_current();
10762 	vnode_t fvp;
10763 	vnode_t svp;
10764 	int error;
10765 	u_int32_t nameiflags;
10766 	char *fpath = NULL;
10767 	char *spath = NULL;
10768 	int   flen = 0, slen = 0;
10769 	int from_truncated = 0, to_truncated = 0;
10770 #if CONFIG_FSE
10771 	fse_info f_finfo, s_finfo;
10772 #endif
10773 
10774 	nameiflags = 0;
10775 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10776 		nameiflags |= FOLLOW;
10777 	}
10778 
10779 	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
10780 	    UIO_USERSPACE, uap->path1, ctx);
10781 
10782 	error = namei(&fnd);
10783 	if (error) {
10784 		goto out2;
10785 	}
10786 
10787 	nameidone(&fnd);
10788 	fvp = fnd.ni_vp;
10789 
10790 	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
10791 	    UIO_USERSPACE, uap->path2, ctx);
10792 
10793 	error = namei(&snd);
10794 	if (error) {
10795 		vnode_put(fvp);
10796 		goto out2;
10797 	}
10798 	nameidone(&snd);
10799 	svp = snd.ni_vp;
10800 
10801 	/*
10802 	 * if the files are the same, return an inval error
10803 	 */
10804 	if (svp == fvp) {
10805 		error = EINVAL;
10806 		goto out;
10807 	}
10808 
10809 	/*
10810 	 * if the files are on different volumes, return an error
10811 	 */
10812 	if (svp->v_mount != fvp->v_mount) {
10813 		error = EXDEV;
10814 		goto out;
10815 	}
10816 
10817 	/* If they're not files, return an error */
10818 	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
10819 		error = EINVAL;
10820 		goto out;
10821 	}
10822 
10823 #if CONFIG_MACF
10824 	error = mac_vnode_check_exchangedata(ctx,
10825 	    fvp, svp);
10826 	if (error) {
10827 		goto out;
10828 	}
10829 #endif
10830 	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
10831 	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
10832 		goto out;
10833 	}
10834 
10835 	if (
10836 #if CONFIG_FSE
10837 		need_fsevent(FSE_EXCHANGE, fvp) ||
10838 #endif
10839 		kauth_authorize_fileop_has_listeners()) {
10840 		GET_PATH(fpath);
10841 		GET_PATH(spath);
10842 
10843 		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
10844 		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
10845 
10846 #if CONFIG_FSE
10847 		get_fse_info(fvp, &f_finfo, ctx);
10848 		get_fse_info(svp, &s_finfo, ctx);
10849 		if (from_truncated || to_truncated) {
10850 			// set it here since only the f_finfo gets reported up to user space
10851 			f_finfo.mode |= FSE_TRUNCATED_PATH;
10852 		}
10853 #endif
10854 	}
10855 	/* Ok, make the call */
10856 	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
10857 
10858 	if (error == 0) {
10859 		const char *tmpname;
10860 
10861 		if (fpath != NULL && spath != NULL) {
10862 			/* call out to allow 3rd party notification of exchangedata.
10863 			 * Ignore result of kauth_authorize_fileop call.
10864 			 */
10865 			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
10866 			    (uintptr_t)fpath, (uintptr_t)spath);
10867 		}
10868 		name_cache_lock();
10869 
10870 		tmpname     = fvp->v_name;
10871 		fvp->v_name = svp->v_name;
10872 		svp->v_name = tmpname;
10873 
10874 		if (fvp->v_parent != svp->v_parent) {
10875 			vnode_t tmp;
10876 
10877 			tmp           = fvp->v_parent;
10878 			fvp->v_parent = svp->v_parent;
10879 			svp->v_parent = tmp;
10880 		}
10881 		name_cache_unlock();
10882 
10883 #if CONFIG_FSE
10884 		if (fpath != NULL && spath != NULL) {
10885 			add_fsevent(FSE_EXCHANGE, ctx,
10886 			    FSE_ARG_STRING, flen, fpath,
10887 			    FSE_ARG_FINFO, &f_finfo,
10888 			    FSE_ARG_STRING, slen, spath,
10889 			    FSE_ARG_FINFO, &s_finfo,
10890 			    FSE_ARG_DONE);
10891 		}
10892 #endif
10893 	}
10894 
10895 out:
10896 	if (fpath != NULL) {
10897 		RELEASE_PATH(fpath);
10898 	}
10899 	if (spath != NULL) {
10900 		RELEASE_PATH(spath);
10901 	}
10902 	vnode_put(svp);
10903 	vnode_put(fvp);
10904 out2:
10905 	return error;
10906 }
10907 
10908 /*
10909  * Return (in MB) the amount of freespace on the given vnode's volume.
10910  */
10911 uint32_t freespace_mb(vnode_t vp);
10912 
10913 uint32_t
freespace_mb(vnode_t vp)10914 freespace_mb(vnode_t vp)
10915 {
10916 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10917 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10918 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10919 }
10920 
10921 #if CONFIG_SEARCHFS
10922 
10923 /* ARGSUSED */
10924 
/*
 * searchfs: search a volume for filesystem objects matching attribute
 * criteria.
 *
 * Copies in the caller's fssearchblock (munging the 32-bit layout into
 * the 64-bit one if needed), validates the search parameter buffers,
 * resolves uap->path to its volume root, and invokes VNOP_SEARCHFS.
 * Search state is round-tripped through uap->state so the search can be
 * continued across calls; for union mounts each layer is searched in
 * turn by returning EAGAIN with the next layer recorded in the state.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The referenced name must lie entirely within searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesnt deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Allright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into he return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11207 
11208 #else /* CONFIG_SEARCHFS */
11209 
/* searchfs() stub: the call is unsupported when CONFIG_SEARCHFS is off. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11215 
11216 #endif /* CONFIG_SEARCHFS */
11217 
11218 
11219 #if CONFIG_DATALESS_FILES
11220 
11221 /*
11222  * === Namespace Resolver Up-call Mechanism ===
11223  *
11224  * When I/O is performed to a dataless file or directory (read, write,
11225  * lookup-in, etc.), the file system performs an upcall to the namespace
11226  * resolver (filecoordinationd) to materialize the object.
11227  *
11228  * We need multiple up-calls to be in flight at once, and we need these
11229  * up-calls to be interruptible, thus the following implementation:
11230  *
11231  * => The nspace_resolver_request represents the in-kernel request state.
11232  *    It contains a request ID, storage space for the errno code returned
11233  *    by filecoordinationd, and flags.
11234  *
11235  * => The request ID is simply a global monotonically incrementing 32-bit
11236  *    number.  Outstanding requests are stored in a hash table, and the
11237  *    hash function is extremely simple.
11238  *
11239  * => When an upcall is to be made to filecoordinationd, a request structure
11240  *    is allocated on the stack (it is small, and needs to live only during
11241  *    the duration of the call to resolve_nspace_item_ext()).  It is
11242  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11244  *    can be inserted into the table (and thus limiting the number of
11245  *    outstanding requests issued to filecoordinationd); waiting for an
11246  *    available slot is interruptible.
11247  *
11248  * => Once the request has been inserted into the table, the up-call is made
11249  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11250  *    immediately and filecoordinationd processes the request asynchronously.
11251  *
 * => The caller now waits for the request to complete.  This is achieved by
11253  *    sleeping on the address of the request structure and waiting for
11254  *    filecoordinationd to mark the request structure as complete.  This
11255  *    is an interruptible sleep call; if interrupted, the request structure
11256  *    is removed from the table and EINTR is returned to the caller.  If
11257  *    this occurs, an advisory up-call is made to filecoordinationd with
11258  *    the request ID to indicate that the request can be aborted or
11259  *    de-prioritized at the discretion of filecoordinationd.
11260  *
11261  * => When filecoordinationd has completed the request, it signals completion
11262  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11263  *    decorated as a namespace resolver can write to this sysctl node.  The
11264  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11265  *    The request ID is looked up in the table, and if the request is found,
11266  *    the error code is stored in the request structure and a wakeup()
11267  *    issued on the address of the request structure.  If the request is not
11268  *    found, we simply drop the completion notification, assuming that the
11269  *    caller was interrupted.
11270  *
11271  * => When the waiting thread wakes up, it extracts the error code from the
11272  *    request structure, removes the request from the table, and returns the
11273  *    error code to the calling function.  Fini!
11274  */
11275 
/*
 * One in-flight materialization request.  Allocated on the stack of the
 * thread calling into the resolver and linked into the request hash
 * table for the duration of the up-call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;	/* hash bucket linkage */
	vnode_t         r_vp;			/* vnode being materialized */
	vnode_t         r_tdvp;			/* rename target dir, or NULL */
	uint32_t        r_req_id;		/* ID used to match completions */
	int             r_resolver_error;	/* errno reported by the resolver */
	int             r_flags;		/* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001	/* request completed; waiter may proceed */
#define RRF_COMPLETING  0x0002	/* completion handler still using this request */

/*
 * Completion tuple written by filecoordinationd via the
 * vfs.nspace.complete sysctl node.
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;		/* ID of the request being completed */
	int32_t  resolver_error;	/* errno result from the resolver */
	uint64_t orig_gencount;		/* expected recursive gencount; 0 = unused */
	uint64_t orig_syncroot;		/* expected sync-root ID; 0 = unused */
};
11294 
11295 static uint32_t
next_nspace_req_id(void)11296 next_nspace_req_id(void)
11297 {
11298 	static uint32_t next_req_id;
11299 
11300 	return OSAddAtomic(1, &next_req_id);
11301 }
11302 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (back-pressure limit). */
static u_int nspace_resolver_request_count;
/* True if some thread is sleeping, waiting for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Single mutex protecting the table, the count, and request flags. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11323 
11324 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11325 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11326 {
11327 	struct nspace_resolver_requesthead *bucket;
11328 	struct nspace_resolver_request *req;
11329 
11330 	bucket = NSPACE_RESOLVER_HASH(req_id);
11331 	LIST_FOREACH(req, bucket, r_hashlink) {
11332 		if (req->r_req_id == req_id) {
11333 			/*
11334 			 * If this request already has a completion
11335 			 * pending, don't return it again.
11336 			 */
11337 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11338 			    skip_completing) {
11339 				req = NULL;
11340 			}
11341 			return req;
11342 		}
11343 	}
11344 
11345 	return NULL;
11346 }
11347 
/*
 * Insert a request into the hash table, applying back-pressure to
 * filecoordinationd: if NSPACE_RESOLVER_MAX_OUTSTANDING requests are
 * already outstanding, sleep (interruptibly) until a slot frees up.
 *
 * Returns 0 on success, or the error from msleep() (e.g. EINTR) if
 * the wait for a free slot was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Ask the next remover to wake us when a slot opens. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	/* Request IDs must be unique within the table. */
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11379 
/*
 * Wait until any in-progress completion for 'req' has finished.
 * Called (and returns) with NSPACE_REQ_LOCK held; msleep() drops and
 * re-acquires the mutex while sleeping.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		/* Uninterruptible: the handler owns 'req' until it's done. */
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11393 
/*
 * Unlink a request from the hash table, wake any thread waiting for a
 * free slot, wait out any in-progress completion (which would still be
 * referencing 'req'), and drop the NSPACE_REQ_LOCK.
 *
 * Called with NSPACE_REQ_LOCK held; returns with it released.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	/*
	 * NOTE(review): this asserts RRF_COMPLETING is clear, yet the
	 * non-DIAGNOSTIC path below still waits for a pending
	 * completion -- confirm an interrupted waiter cannot race a
	 * concurrent completion here on DIAGNOSTIC kernels.
	 */
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* Wake a thread blocked in nspace_resolver_req_add(), if any. */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11418 
/*
 * Remove a request from the table.  Convenience wrapper that acquires
 * the NSPACE_REQ_LOCK; nspace_resolver_req_remove_and_unlock() drops
 * it before returning.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11425 
11426 static void
nspace_resolver_req_cancel(uint32_t req_id)11427 nspace_resolver_req_cancel(uint32_t req_id)
11428 {
11429 	kern_return_t kr;
11430 	mach_port_t mp;
11431 
11432 	// Failures here aren't fatal -- the cancellation message
11433 	// sent to the resolver is merely advisory.
11434 
11435 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11436 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11437 		return;
11438 	}
11439 
11440 	kr = send_nspace_resolve_cancel(mp, req_id);
11441 	if (kr != KERN_SUCCESS) {
11442 		os_log_error(OS_LOG_DEFAULT,
11443 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11444 	}
11445 
11446 	ipc_port_release_send(mp);
11447 }
11448 
/*
 * Wait (interruptibly) for a request to be completed by the resolver.
 *
 * If the sleep is interrupted, the request is failed locally with
 * EINTR (or ETIMEDOUT for other msleep errors) and an advisory
 * cancellation is sent to filecoordinationd.  In all cases the
 * request is removed from the table before returning.
 *
 * Returns the resolver's error code (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting". */
		if (error && error != ERESTART) {
			/* Interrupted; fail the request ourselves. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		/* Advise the resolver that this request can be dropped. */
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11481 
11482 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11483 nspace_resolver_req_mark_complete(
11484 	struct nspace_resolver_request *req,
11485 	int resolver_error)
11486 {
11487 	req->r_resolver_error = resolver_error;
11488 	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
11489 	wakeup(req);
11490 }
11491 
/*
 * Mark a request as having a completion in progress.  This keeps the
 * waiter from tearing down 'req' while the completion handler works
 * on it with the NSPACE_REQ_LOCK dropped.  Called with the lock held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11497 
11498 static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data * c)11499 nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
11500 {
11501 	struct nspace_resolver_request *req;
11502 	int error;
11503 	struct vnode_attr va;
11504 	vnode_t vp;
11505 
11506 	NSPACE_REQ_LOCK();
11507 
11508 	req = nspace_resolver_req_lookup(c->req_id, true);
11509 	if (req == NULL) {
11510 		/*
11511 		 * If we don't find the request corresponding to our req_id,
11512 		 * just drop the completion on the floor; it's likely that
11513 		 * the requester interrupted with a signal, or it may already
11514 		 * be completing.
11515 		 */
11516 		NSPACE_REQ_UNLOCK();
11517 		return;
11518 	}
11519 
11520 	/*
11521 	 * Get out now if the resolver reported an error.
11522 	 */
11523 	if ((error = c->resolver_error) != 0) {
11524 		goto out;
11525 	}
11526 
11527 	/*
11528 	 * If the resolver did not specify any namespace shape criteria
11529 	 * for letting the operation proceed, then get out now.
11530 	 */
11531 	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
11532 		goto out;
11533 	}
11534 
11535 	/*
11536 	 * We're going to have to acquire the mount rename lock and do
11537 	 * some I/O in order to verify the criteria.  Mark the request
11538 	 * as pending so no one else messes with it after we drop the
11539 	 * NSPACE_REQ_LOCK.
11540 	 */
11541 	nspace_resolver_req_mark_completion_pending(req);
11542 	NSPACE_REQ_UNLOCK();
11543 
11544 	/*
11545 	 * Lock out renames from changing the shape of the tree while
11546 	 * validate the criteria.
11547 	 */
11548 	mount_t locked_mp = req->r_vp->v_mount;
11549 	mount_ref(locked_mp, 0);
11550 	mount_lock_renames(locked_mp);
11551 
11552 	if (c->orig_gencount != 0) {
11553 		vp = req->r_vp;
11554 		if (error) {
11555 			goto out_dropmount;
11556 		}
11557 
11558 		VATTR_INIT(&va);
11559 		VATTR_WANTED(&va, va_recursive_gencount);
11560 		error = vnode_getattr(vp, &va, vfs_context_kernel());
11561 		if (error) {
11562 			goto out_dropmount;
11563 		}
11564 		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
11565 		    va.va_recursive_gencount != c->orig_gencount) {
11566 			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
11567 			    c->orig_gencount, va.va_recursive_gencount);
11568 			error = EBUSY;
11569 			goto out_dropmount;
11570 		}
11571 	}
11572 
11573 	/*
11574 	 * Ignore orig_syncroot if a destination directory wasn't specified
11575 	 * in the request.
11576 	 */
11577 	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
11578 		uint64_t syncroot_id;
11579 
11580 		if (error) {
11581 			goto out_dropmount;
11582 		}
11583 
11584 #ifndef APFSIOC_GET_SYNC_ROOT
11585 #define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
11586 #endif
11587 
11588 		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
11589 		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
11590 		if (error) {
11591 			goto out_dropmount;
11592 		}
11593 		if (syncroot_id != c->orig_syncroot) {
11594 			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
11595 			    c->orig_syncroot, syncroot_id);
11596 			error = EBUSY;
11597 			goto out_dropmount;
11598 		}
11599 	}
11600 
11601 out_dropmount:
11602 	mount_unlock_renames(locked_mp);
11603 	mount_drop(locked_mp, 0);
11604 	NSPACE_REQ_LOCK();
11605 
11606 out:
11607 	nspace_resolver_req_mark_complete(req, error);
11608 	NSPACE_REQ_UNLOCK();
11609 }
11610 
/* The process currently decorated as the namespace resolver, if any. */
static struct proc *nspace_resolver_proc;

/*
 * Report (via *is_resolver) whether 'p' is the registered namespace
 * resolver process.  Always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
11620 
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Decorate (is_resolver != 0) or un-decorate process 'p' as the
 * namespace resolver.  Only a root process holding the
 * dataless-resolver entitlement may do this (EPERM otherwise), and
 * only one resolver may be registered at a time (EBUSY otherwise).
 * Un-decoration is delegated to nspace_resolver_exited().
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11662 
11663 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11664 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11665 {
11666 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11667 	    (p->p_vfs_iopolicy &
11668 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11669 		*is_prevented = 1;
11670 	} else {
11671 		*is_prevented = 0;
11672 	}
11673 	return 0;
11674 }
11675 
11676 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11677 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11678 {
11679 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
11680 		return is_prevented ? 0 : EBUSY;
11681 	}
11682 
11683 	if (is_prevented) {
11684 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11685 	} else {
11686 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11687 	}
11688 	return 0;
11689 }
11690 
11691 static int
nspace_materialization_get_thread_state(int * is_prevented)11692 nspace_materialization_get_thread_state(int *is_prevented)
11693 {
11694 	uthread_t ut = current_uthread();
11695 
11696 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11697 	return 0;
11698 }
11699 
/*
 * Decorate (is_prevented != 0) or un-decorate the current thread as
 * refusing to trigger dataless-file materialization faults.  Always
 * returns 0.
 */
static int
nspace_materialization_set_thread_state(int is_prevented)
{
	uthread_t ut = current_uthread();

	if (is_prevented) {
		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
	} else {
		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
	}
	return 0;
}
11712 
/* the vfs.nspace branch: parent node for the namespace-resolver sysctls */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11715 
11716 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11717 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11718     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11719 {
11720 	struct proc *p = req->p;
11721 	int new_value, old_value, changed = 0;
11722 	int error;
11723 
11724 	error = nspace_resolver_get_proc_state(p, &old_value);
11725 	if (error) {
11726 		return error;
11727 	}
11728 
11729 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11730 	    &changed);
11731 	if (error == 0 && changed) {
11732 		error = nspace_resolver_set_proc_state(p, new_value);
11733 	}
11734 	return error;
11735 }
11736 
11737 /* decorate this process as the dataless file resolver */
11738 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11739     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11740     0, 0, sysctl_nspace_resolver, "I", "");
11741 
11742 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11743 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11744     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11745 {
11746 	struct proc *p = req->p;
11747 	int new_value, old_value, changed = 0;
11748 	int error;
11749 
11750 	error = nspace_materialization_get_proc_state(p, &old_value);
11751 	if (error) {
11752 		return error;
11753 	}
11754 
11755 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11756 	    &changed);
11757 	if (error == 0 && changed) {
11758 		error = nspace_materialization_set_proc_state(p, new_value);
11759 	}
11760 	return error;
11761 }
11762 
11763 /* decorate this process as not wanting to materialize dataless files */
11764 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11765     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11766     0, 0, sysctl_nspace_prevent_materialization, "I", "");
11767 
11768 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11769 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11770     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11771 {
11772 	int new_value, old_value, changed = 0;
11773 	int error;
11774 
11775 	error = nspace_materialization_get_thread_state(&old_value);
11776 	if (error) {
11777 		return error;
11778 	}
11779 
11780 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11781 	    &changed);
11782 	if (error == 0 && changed) {
11783 		error = nspace_materialization_set_thread_state(new_value);
11784 	}
11785 	return error;
11786 }
11787 
11788 /* decorate this thread as not wanting to materialize dataless files */
11789 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11790     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11791     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11792 
/*
 * sysctl handler for vfs.nspace.complete: the resolver reports request
 * completions here.  The write payload is a pair of uint32_t values
 * { req_id, errno }, optionally followed by a uint64_t recursive
 * gencount and a uint64_t sync-root ID (both optional; treated as 0,
 * i.e. "unused", when absent).  Only the registered resolver process
 * may write to this node.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Only the registered resolver may report completions. */
	if (!is_resolver) {
		return EPERM;
	}

	/* The req_id/errno pair is mandatory. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11860 
11861 #endif /* CONFIG_DATALESS_FILES */
11862 
11863 #if CONFIG_DATALESS_FILES
11864 #define __no_dataless_unused    /* nothing */
11865 #else
11866 #define __no_dataless_unused    __unused
11867 #endif
11868 
/*
 * Decide whether materialization of a dataless object is prevented
 * for the given vfs context.
 *
 * Returns:
 *   0            materialization may proceed
 *   EDEADLK      materialization is prevented (kernel context, thread
 *                decoration, or process iopolicy)
 *   EJUSTRETURN  the caller is an entitled dataless manipulator; the
 *                operation proceeds as if the object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11925 
/*
 * One-time initialization of the namespace-resolver request hash
 * table.  No-op when CONFIG_DATALESS_FILES is disabled.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11935 
/*
 * Called when a process exits (or un-decorates itself via the
 * vfs.nspace.resolver sysctl).  If 'p' is the registered resolver,
 * fail every outstanding request with ETIMEDOUT -- their waiters can
 * no longer expect a completion -- and clear the resolver global.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket and complete each request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/*
				 * Wait out any in-flight completion first;
				 * the handler still references 'req'.
				 */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11962 
/* Entitlement required to register as the dataless-file resolver. */
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
/* Entitlement allowing direct manipulation of dataless objects. */
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"
11967 
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver, i.e. a task holding DATALESS_RESOLVER_ENTITLEMENT.
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11980 
/*
 * Return TRUE if the vfs context is associated with a process entitled
 * for dataless manipulation.  Note that the resolver entitlement
 * implies the manipulation entitlement.
 *
 * XXX Arguably belongs in vfs_subr.c, but is here because of the
 * complication around CONFIG_DATALESS_FILES.
 */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	task_t task = vfs_context_task(ctx);
	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
#else
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
11999 
#if CONFIG_DATALESS_FILES
/*
 * Log (at debug level) that materialization of 'vp' was blocked for
 * the current process.  On DEVELOPMENT kernels the vnode's path is
 * included in the message.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(&p_name[0], sizeof(p_name));

	/* Human-readable vnode type for the log message. */
	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	char *path = NULL;
	int   len;

	path = get_pathbuff();
	len = MAXPATHLEN;
	if (path) {
		/* Best effort; a failed lookup leaves the buffer unused. */
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12043 
12044 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12045 vfs_materialize_item(
12046 	vnode_t vp __no_dataless_unused,
12047 	uint32_t op __no_dataless_unused,
12048 	int64_t offset __no_dataless_unused,
12049 	int64_t size __no_dataless_unused,
12050 	char *lookup_name __no_dataless_unused,
12051 	size_t const namelen __no_dataless_unused,
12052 	vnode_t tdvp __no_dataless_unused)
12053 {
12054 #if CONFIG_DATALESS_FILES
12055 	kern_return_t kern_ret;
12056 	mach_port_t mach_port;
12057 	char *path = NULL;
12058 	vfs_context_t context;
12059 	int path_len;
12060 	int error;
12061 	audit_token_t atoken;
12062 	enum vtype vp_vtype;
12063 
12064 	/* Swap files are special; ignore them */
12065 	if (vnode_isswap(vp)) {
12066 		return 0;
12067 	}
12068 
12069 	/*
12070 	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12071 	 * are no longer used nor supported.
12072 	 */
12073 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12074 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12075 		return ENOTSUP;
12076 	}
12077 	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12078 		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12079 		return ENOTSUP;
12080 	}
12081 
12082 	/* Normalize 'op'. */
12083 	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12084 
12085 	/*
12086 	 * To-directory is only meaningful for rename operations;
12087 	 * ignore it if someone handed one to us unexpectedly.
12088 	 */
12089 	if (op != NAMESPACE_HANDLER_RENAME_OP) {
12090 		tdvp = NULL;
12091 	}
12092 
12093 	context = vfs_context_current();
12094 
12095 	/* Remember this for later. */
12096 	vp_vtype = vnode_vtype(vp);
12097 
12098 	error = vfs_context_dataless_materialization_is_prevented(context);
12099 	if (error) {
12100 		log_materialization_prevented(vp, op);
12101 		goto out_check_errors;
12102 	}
12103 
12104 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12105 	    &mach_port);
12106 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12107 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12108 		/*
12109 		 * Treat this like being unable to access the backing store
12110 		 * server.
12111 		 */
12112 		return ETIMEDOUT;
12113 	}
12114 
12115 	int path_alloc_len = MAXPATHLEN;
12116 	do {
12117 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12118 		if (path == NULL) {
12119 			return ENOMEM;
12120 		}
12121 
12122 		path_len = path_alloc_len;
12123 		error = vn_getpath(vp, path, &path_len);
12124 		if (error == 0) {
12125 			break;
12126 		} else if (error == ENOSPC) {
12127 			kfree_data(path, path_alloc_len);
12128 			path = NULL;
12129 		} else {
12130 			goto out_release_port;
12131 		}
12132 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12133 
12134 	error = vfs_context_copy_audit_token(context, &atoken);
12135 	if (error) {
12136 		goto out_release_port;
12137 	}
12138 
12139 	struct nspace_resolver_request req = {
12140 		.r_req_id = next_nspace_req_id(),
12141 		.r_vp = vp,
12142 		.r_tdvp = tdvp,
12143 	};
12144 
12145 	error = nspace_resolver_req_add(&req);
12146 	if (error) {
12147 		goto out_release_port;
12148 	}
12149 
12150 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12151 
12152 	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12153 		char *dest_path = NULL;
12154 		int dest_path_len;
12155 
12156 		dest_path = zalloc(ZV_NAMEI);
12157 		dest_path_len = MAXPATHLEN;
12158 
12159 		error = vn_getpath(tdvp, dest_path, &dest_path_len);
12160 		if (error) {
12161 			zfree(ZV_NAMEI, dest_path);
12162 			goto out_release_port;
12163 		}
12164 
12165 		/*
12166 		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12167 		 * compatibility with existing agents in user-space
12168 		 * who get passed this value.
12169 		 */
12170 		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12171 		    req.r_req_id,
12172 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12173 		    path, dest_path, atoken);
12174 
12175 		zfree(ZV_NAMEI, dest_path);
12176 	} else if (vp_vtype == VDIR) {
12177 		char *tmpname = NULL;
12178 
12179 		/*
12180 		 * If the caller provided a lookup_name *and* a name length,
12181 		 * then we assume the lookup_name is not NUL-terminated.
12182 		 * Allocate a temporary buffer in this case to provide
12183 		 * a NUL-terminated path name to the IPC call.
12184 		 */
12185 		if (lookup_name != NULL && namelen != 0) {
12186 			if (namelen >= PATH_MAX) {
12187 				error = EINVAL;
12188 				goto out_req_remove;
12189 			}
12190 			tmpname = zalloc(ZV_NAMEI);
12191 			strlcpy(tmpname, lookup_name, namelen + 1);
12192 			lookup_name = tmpname;
12193 		} else if (lookup_name != NULL) {
12194 			/*
12195 			 * If the caller provided a lookup_name with a
12196 			 * zero name length, then we assume it's NUL-
12197 			 * terminated.  Verify it has a valid length.
12198 			 */
12199 			if (strlen(lookup_name) >= PATH_MAX) {
12200 				error = EINVAL;
12201 				goto out_req_remove;
12202 			}
12203 		}
12204 
12205 		/* (See above.) */
12206 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12207 		    req.r_req_id,
12208 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12209 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12210 
12211 		if (tmpname != NULL) {
12212 			zfree(ZV_NAMEI, tmpname);
12213 
12214 			/*
12215 			 * Poison lookup_name rather than reference
12216 			 * freed memory.
12217 			 */
12218 			lookup_name = NULL;
12219 		}
12220 	} else {
12221 		/* (See above.) */
12222 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12223 		    req.r_req_id,
12224 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12225 		    offset, size, path, atoken);
12226 	}
12227 	if (kern_ret != KERN_SUCCESS) {
12228 		/*
12229 		 * Also treat this like being unable to access the backing
12230 		 * store server.
12231 		 */
12232 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12233 		    kern_ret);
12234 		error = ETIMEDOUT;
12235 		goto out_req_remove;
12236 	}
12237 
12238 	/*
12239 	 * Give back the memory we allocated earlier while we wait; we
12240 	 * no longer need it.
12241 	 */
12242 	kfree_data(path, path_alloc_len);
12243 	path = NULL;
12244 
12245 	/*
12246 	 * Request has been submitted to the resolver. Now (interruptibly)
	 * wait for completion. Upon return, the request will have been
12248 	 * removed from the lookup table.
12249 	 */
12250 	error = nspace_resolver_req_wait(&req);
12251 
12252 out_release_port:
12253 	if (path != NULL) {
12254 		kfree_data(path, path_alloc_len);
12255 		path = NULL;
12256 	}
12257 	ipc_port_release_send(mach_port);
12258 
12259 out_check_errors:
12260 	/*
12261 	 * The file resolver owns the logic about what error to return
12262 	 * to the caller.  We only need to handle a couple of special
12263 	 * cases here:
12264 	 */
12265 	if (error == EJUSTRETURN) {
12266 		/*
12267 		 * The requesting process is allowed to interact with
12268 		 * dataless objects.  Make a couple of sanity-checks
12269 		 * here to ensure the action makes sense.
12270 		 */
12271 		switch (op) {
12272 		case NAMESPACE_HANDLER_WRITE_OP:
12273 		case NAMESPACE_HANDLER_TRUNCATE_OP:
12274 		case NAMESPACE_HANDLER_RENAME_OP:
12275 			/*
12276 			 * This handles the case of the resolver itself
12277 			 * writing data to the file (or throwing it
12278 			 * away).
12279 			 */
12280 			error = 0;
12281 			break;
12282 		case NAMESPACE_HANDLER_READ_OP:
12283 		case NAMESPACE_HANDLER_LOOKUP_OP:
12284 			/*
12285 			 * This handles the case of the resolver needing
12286 			 * to look up inside of a dataless directory while
12287 			 * it's in the process of materializing it (for
12288 			 * example, creating files or directories).
12289 			 */
12290 			error = (vp_vtype == VDIR) ? 0 : EBADF;
12291 			break;
12292 		default:
12293 			error = EBADF;
12294 			break;
12295 		}
12296 	}
12297 
12298 	return error;
12299 
12300 out_req_remove:
12301 	nspace_resolver_req_remove(&req);
12302 	goto out_release_port;
12303 #else
12304 	return ENOTSUP;
12305 #endif /* CONFIG_DATALESS_FILES */
12306 }
12307 
12308 /*
12309  * vfs_materialize_file: Materialize a regular file.
12310  *
12311  * Inputs:
12312  * vp		The dataless file to be materialized.
12313  *
12314  * op		What kind of operation is being performed:
12315  *		-> NAMESPACE_HANDLER_READ_OP
12316  *		-> NAMESPACE_HANDLER_WRITE_OP
12317  *		-> NAMESPACE_HANDLER_LINK_CREATE
12318  *		-> NAMESPACE_HANDLER_DELETE_OP
12319  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12320  *		-> NAMESPACE_HANDLER_RENAME_OP
12321  *
12322  * offset	offset of I/O for READ or WRITE.  Ignored for
12323  *		other ops.
12324  *
 * size		size of I/O for READ or WRITE.  Ignored for
12326  *		other ops.
12327  *
12328  * If offset or size are -1 for a READ or WRITE, then the resolver should
12329  * consider the range to be unknown.
12330  *
12331  * Upon successful return, the caller may proceed with the operation.
12332  * N.B. the file may still be "dataless" in this case.
12333  */
12334 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12335 vfs_materialize_file(
12336 	struct vnode *vp,
12337 	uint64_t op,
12338 	int64_t offset,
12339 	int64_t size)
12340 {
12341 	if (vp->v_type != VREG) {
12342 		return EFTYPE;
12343 	}
12344 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12345 	           NULL);
12346 }
12347 
12348 /*
12349  * vfs_materialize_dir:
12350  *
12351  * Inputs:
12352  * vp		The dataless directory to be materialized.
12353  *
12354  * op		What kind of operation is being performed:
12355  *		-> NAMESPACE_HANDLER_READ_OP
12356  *		-> NAMESPACE_HANDLER_WRITE_OP
12357  *		-> NAMESPACE_HANDLER_DELETE_OP
12358  *		-> NAMESPACE_HANDLER_RENAME_OP
12359  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12360  *
12361  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12362  *		other ops.  May or may not be NUL-terminated; see below.
12363  *
12364  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12365  *		terminated and namelen is the number of valid bytes in
12366  *		lookup_name. If zero, then lookup_name is assumed to be
12367  *		NUL-terminated.
12368  *
12369  * Upon successful return, the caller may proceed with the operation.
12370  * N.B. the directory may still be "dataless" in this case.
12371  */
12372 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12373 vfs_materialize_dir(
12374 	struct vnode *vp,
12375 	uint64_t op,
12376 	char *lookup_name,
12377 	size_t namelen)
12378 {
12379 	if (vp->v_type != VDIR) {
12380 		return EFTYPE;
12381 	}
12382 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12383 		return EINVAL;
12384 	}
12385 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12386 	           namelen, NULL);
12387 }
12388 
12389 /*
12390  * vfs_materialize_reparent:
12391  *
12392  * Inputs:
12393  * vp		The dataless file or directory to be materialized.
12394  *
12395  * tdvp		The new parent directory for the dataless file.
12396  *
12397  * Upon successful return, the caller may proceed with the operation.
12398  * N.B. the item may still be "dataless" in this case.
12399  */
12400 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12401 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12402 {
12403 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12404 		return EFTYPE;
12405 	}
12406 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12407 	           0, 0, NULL, 0, tdvp);
12408 }
12409 
#if 0
/*
 * build_volfs_path: Construct a "/.vol/<fsid>/<fileid>" style path for
 * the given vnode.  (Currently compiled out.)
 *
 * Returns 0 on success, -1 if the vnode's attributes could not be
 * fetched.  In both cases *len is updated to the length of the string
 * written into 'path' (including the NUL terminator).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		/* Attribute fetch failed: emit a recognizable bogus path. */
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12432 
12433 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12434 fsctl_bogus_command_compat(unsigned long cmd)
12435 {
12436 	switch (cmd) {
12437 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12438 		return FSIOC_SYNC_VOLUME;
12439 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12440 		return FSIOC_ROUTEFS_SETROUTEID;
12441 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12442 		return FSIOC_SET_PACKAGE_EXTS;
12443 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12444 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12445 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12446 		return DISK_CONDITIONER_IOC_GET;
12447 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12448 		return DISK_CONDITIONER_IOC_SET;
12449 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12450 		return FSIOC_FIOSEEKHOLE;
12451 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12452 		return FSIOC_FIOSEEKDATA;
12453 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12454 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12455 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12456 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12457 	}
12458 
12459 	return cmd;
12460 }
12461 
/*
 * cas_bsdflags_setattr: chflags0() setattr callback that performs the
 * compare-and-swap of the BSD flags by issuing FSIOC_CAS_BSDFLAGS
 * directly to the filesystem.  'arg' is the struct fsioc_cas_bsdflags
 * passed through from handle_flags().
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12467 
/*
 * handle_sync_volume: FSIOC_SYNC_VOLUME implementation.  Syncs the
 * mount containing 'vp' (waiting if FSCTL_SYNC_WAIT is set in the
 * user argument), optionally followed by an F_FULLFSYNC on the vnode.
 *
 * The iocount on 'vp' is released in here; on return *arg_vp is set
 * to NULL so the caller knows not to vnode_put() it again.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/*
	 * Take a holdcount before dropping the iocount so the vnode
	 * cannot be freed out from under us while we sync.
	 */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests FSCTL_SYNC_FULLSYNC against the MNT_*
	 * flag word built above, not against the user-supplied FSCTL
	 * flags in *data -- confirm the bit overlap is intentional.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12532 
#if ROUTEFS
/*
 * handle_routes: FSIOC_ROUTEFS_SETROUTEID implementation.  Copies a
 * path in from user space and mounts routefs there.  Restricted to
 * the superuser.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t copied = 0;
	int err;

	/* Only root may establish the routefs mount. */
	err = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (err != 0) {
		return err;
	}

	bzero(routepath, MAXPATHLEN);
	err = copyinstr(udata, &routepath[0], MAXPATHLEN, &copied);
	if (err != 0) {
		return err;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12553 
12554 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12555 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12556 {
12557 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12558 	struct vnode_attr va;
12559 	int error;
12560 
12561 	VATTR_INIT(&va);
12562 	VATTR_SET(&va, va_flags, cas->new_flags);
12563 
12564 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12565 
12566 #if CONFIG_FSE
12567 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12568 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12569 	}
12570 #endif
12571 
12572 	return error;
12573 }
12574 
12575 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12576 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12577 {
12578 	struct mount *mp = NULL;
12579 	errno_t rootauth = 0;
12580 
12581 	mp = vp->v_mount;
12582 
12583 	/*
12584 	 * query the underlying FS and see if it reports something
12585 	 * sane for this vnode. If volume is authenticated via
12586 	 * chunklist, leave that for the caller to determine.
12587 	 */
12588 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12589 
12590 	return rootauth;
12591 }
12592 
12593 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12594 	"com.apple.private.kernel.set-package-extensions"
12595 
12596 /*
12597  * Make a filesystem-specific control call:
12598  */
12599 /* ARGSUSED */
/*
 * fsctl_internal: Common implementation for fsctl(2) and ffsctl(2).
 *
 * Marshals the ioctl-style argument according to the IOC_IN / IOC_OUT /
 * IOC_VOID direction bits encoded in 'cmd', handles the generic FSIOC_*
 * selectors in the VFS layer, and forwards anything else down to the
 * filesystem via VNOP_IOCTL().
 *
 * NOTE: *arg_vp may be set to NULL on return (handle_sync_volume()
 * releases the vnode's iocount); callers must re-check it before
 * calling vnode_put().
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not supported on character or block device nodes. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Re-form selectors that arrived with size/direction bits stripped. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Use the stack buffer when the argument fits; else heap-allocate. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Copy the input argument in from user space. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length IOC_IN: the pointer itself is the datum. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NOTE: may release the vnode and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		/* Caller must hold the package-extensions entitlement. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		/* Root-only operation. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Verify the override name is NUL-terminated. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear any existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* EBUSY unless this fd holds the only usecount on the vnode. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12856 
12857 /* ARGSUSED */
/*
 * fsctl: fsctl(2) system call -- make a filesystem-specific control
 * call on a path.  Looks up the path (honoring FSOPT_NOFOLLOW), runs
 * the MACF mount check, then hands off to fsctl_internal().
 *
 * fsctl_internal() may release the vnode and NULL it out, hence the
 * "if (vp)" check before vnode_put() below.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone else's).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* Firmlink control operates on the firmlink itself. */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
12909 /* ARGSUSED */
/*
 * ffsctl: ffsctl(2) system call -- make a filesystem-specific control
 * call on an open file descriptor.  Resolves the fd to a vnode, runs
 * the MACF mount check, then hands off to fsctl_internal().
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on:  */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
12951 /* end of fsctl system call */
12952 
12953 #define FILESEC_ACCESS_ENTITLEMENT              \
12954 	"com.apple.private.vfs.filesec-access"
12955 
12956 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12957 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12958 {
12959 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12960 		/*
12961 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12962 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12963 		 */
12964 		if ((!setting && vfs_context_issuser(ctx)) ||
12965 		    IOTaskHasEntitlement(vfs_context_task(ctx),
12966 		    FILESEC_ACCESS_ENTITLEMENT)) {
12967 			return 0;
12968 		}
12969 	}
12970 
12971 	return EPERM;
12972 }
12973 
12974 /*
12975  *  Retrieve the data of an extended attribute.
12976  */
/*
 * getxattr: getxattr(2) system call -- retrieve the data of an
 * extended attribute by path.  With a NULL value buffer (or the size
 * sentinels described below), only the attribute's size is returned
 * via *retval.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These options are only meaningful inside the kernel. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or an entitlement to read. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the read to a sane wired-memory limit (see above). */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio, report bytes transferred; otherwise the attr size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13059 
13060 /*
13061  * Retrieve the data of an extended attribute.
13062  */
/*
 * fgetxattr: fgetxattr(2) system call -- retrieve the data of an
 * extended attribute on an open file descriptor.  With a NULL value
 * buffer or zero size, only the attribute's size is returned via
 * *retval.
 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* NOFOLLOW is meaningless on an fd; the others are kernel-only. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or an entitlement to read. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	if (uap->value && uap->size > 0) {
		/* Clamp the read to the wired-memory limit. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	/* With a uio, report bytes transferred; otherwise the attr size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13117 
/*
 * Bundled lookup and copy state for setxattr(); heap-allocated there
 * (kalloc_type) to keep the syscall's stack footprint small.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13124 
13125 /*
13126  * Set the data of an extended attribute.
13127  */
/*
 * setxattr: setxattr(2) system call -- set the data of an extended
 * attribute by path.  The nameidata/attrname/uio state lives in a
 * heap-allocated setxattr_ctx to keep this frame small.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* These options are only meaningful inside the kernel. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected attributes require the entitlement to write. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent too so its directory lease can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13204 
13205 /*
13206  * Set the data of an extended attribute.
13207  */
/*
 * fsetxattr: fsetxattr(2) system call -- set the data of an extended
 * attribute on an open file descriptor.
 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* NOFOLLOW is meaningless on an fd; the others are kernel-only. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	/* Protected attributes require the entitlement to write. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13272 
13273 /*
13274  * Remove an extended attribute.
13275  * XXX Code duplication here.
13276  */
/*
 * removexattr(2): remove the named extended attribute from the file at
 * uap->path.
 *
 * NOTE(review): unlike fsetxattr() above, a copyinstr() EPERM (name too
 * long for attrname) is returned as-is here instead of being mapped to
 * ENAMETOOLONG -- confirm whether this asymmetry is intentional.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* System-reserved attribute names may never be removed this way. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also resolve the parent so its directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	/* Drop the parent iocount taken by WANTPARENT. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13327 
13328 /*
13329  * Remove an extended attribute.
13330  * XXX Code duplication here.
13331  */
/*
 * fremovexattr(2): remove the named extended attribute from the vnode
 * referenced by an open file descriptor.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* Only needed to tag the fsevent below. */
	vfs_context_t ctx = vfs_context_current();
#endif

	/* XATTR_NOFOLLOW is meaningless for an already-resolved fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* System-reserved attribute names may never be removed this way. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13379 
13380 /*
13381  * Retrieve the list of extended attribute names.
13382  * XXX Code duplication here.
13383  */
13384 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13385 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13386 {
13387 	vnode_t vp;
13388 	struct nameidata nd;
13389 	vfs_context_t ctx = vfs_context_current();
13390 	uio_t auio = NULL;
13391 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13392 	size_t attrsize = 0;
13393 	u_int32_t nameiflags;
13394 	int error;
13395 	UIO_STACKBUF(uio_buf, 1);
13396 
13397 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13398 		return EINVAL;
13399 	}
13400 
13401 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13402 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13403 	if ((error = namei(&nd))) {
13404 		return error;
13405 	}
13406 	vp = nd.ni_vp;
13407 	nameidone(&nd);
13408 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13409 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13410 		    &uio_buf[0], sizeof(uio_buf));
13411 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13412 	}
13413 
13414 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13415 
13416 	vnode_put(vp);
13417 	if (auio) {
13418 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13419 	} else {
13420 		*retval = (user_ssize_t)attrsize;
13421 	}
13422 	return error;
13423 }
13424 
13425 /*
13426  * Retrieve the list of extended attribute names.
13427  * XXX Code duplication here.
13428  */
13429 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13430 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13431 {
13432 	vnode_t vp;
13433 	uio_t auio = NULL;
13434 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13435 	size_t attrsize = 0;
13436 	int error;
13437 	UIO_STACKBUF(uio_buf, 1);
13438 
13439 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13440 		return EINVAL;
13441 	}
13442 
13443 	if ((error = file_vnode(uap->fd, &vp))) {
13444 		return error;
13445 	}
13446 	if ((error = vnode_getwithref(vp))) {
13447 		file_drop(uap->fd);
13448 		return error;
13449 	}
13450 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13451 		auio = uio_createwithbuffer(1, 0, spacetype,
13452 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13453 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13454 	}
13455 
13456 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13457 
13458 	vnode_put(vp);
13459 	file_drop(uap->fd);
13460 	if (auio) {
13461 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13462 	} else {
13463 		*retval = (user_ssize_t)attrsize;
13464 	}
13465 	return error;
13466 }
13467 
13468 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13469 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13470     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13471 {
13472 	int error;
13473 	struct mount *mp = NULL;
13474 	vnode_t vp;
13475 	int length;
13476 	int bpflags;
13477 	/* maximum number of times to retry build_path */
13478 	unsigned int retries = 0x10;
13479 
13480 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13481 		return EINVAL;
13482 	}
13483 
13484 	if (buf == NULL) {
13485 		return ENOMEM;
13486 	}
13487 
13488 retry:
13489 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13490 		error = ENOTSUP;  /* unexpected failure */
13491 		return ENOTSUP;
13492 	}
13493 
13494 #if CONFIG_UNION_MOUNTS
13495 unionget:
13496 #endif /* CONFIG_UNION_MOUNTS */
13497 	if (objid == 2) {
13498 		struct vfs_attr vfsattr;
13499 		int use_vfs_root = TRUE;
13500 
13501 		VFSATTR_INIT(&vfsattr);
13502 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13503 		if (!(options & FSOPT_ISREALFSID) &&
13504 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13505 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13506 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13507 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13508 				use_vfs_root = FALSE;
13509 			}
13510 		}
13511 
13512 		if (use_vfs_root) {
13513 			error = VFS_ROOT(mp, &vp, ctx);
13514 		} else {
13515 			error = VFS_VGET(mp, objid, &vp, ctx);
13516 		}
13517 	} else {
13518 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13519 	}
13520 
13521 #if CONFIG_UNION_MOUNTS
13522 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13523 		/*
13524 		 * If the fileid isn't found and we're in a union
13525 		 * mount volume, then see if the fileid is in the
13526 		 * mounted-on volume.
13527 		 */
13528 		struct mount *tmp = mp;
13529 		mp = vnode_mount(tmp->mnt_vnodecovered);
13530 		vfs_unbusy(tmp);
13531 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13532 			goto unionget;
13533 		}
13534 	} else {
13535 		vfs_unbusy(mp);
13536 	}
13537 #else
13538 	vfs_unbusy(mp);
13539 #endif /* CONFIG_UNION_MOUNTS */
13540 
13541 	if (error) {
13542 		return error;
13543 	}
13544 
13545 #if CONFIG_MACF
13546 	error = mac_vnode_check_fsgetpath(ctx, vp);
13547 	if (error) {
13548 		vnode_put(vp);
13549 		return error;
13550 	}
13551 #endif
13552 
13553 	/* Obtain the absolute path to this vnode. */
13554 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13555 	if (options & FSOPT_NOFIRMLINKPATH) {
13556 		bpflags |= BUILDPATH_NO_FIRMLINK;
13557 	}
13558 	bpflags |= BUILDPATH_CHECK_MOVED;
13559 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13560 	vnode_put(vp);
13561 
13562 	if (error) {
13563 		/* there was a race building the path, try a few more times */
13564 		if (error == EAGAIN) {
13565 			--retries;
13566 			if (retries > 0) {
13567 				goto retry;
13568 			}
13569 
13570 			error = ENOENT;
13571 		}
13572 		goto out;
13573 	}
13574 
13575 	AUDIT_ARG(text, buf);
13576 
13577 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13578 		unsigned long path_words[NUMPARMS];
13579 		size_t path_len = sizeof(path_words);
13580 
13581 		if ((size_t)length < path_len) {
13582 			memcpy((char *)path_words, buf, length);
13583 			memset((char *)path_words + length, 0, path_len - length);
13584 
13585 			path_len = length;
13586 		} else {
13587 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13588 		}
13589 
13590 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13591 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13592 	}
13593 
13594 	*pathlen = length; /* may be superseded by error */
13595 
13596 out:
13597 	return error;
13598 }
13599 
13600 /*
13601  * Obtain the full pathname of a file system object by id.
13602  */
13603 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13604 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13605     uint32_t options, user_ssize_t *retval)
13606 {
13607 	vfs_context_t ctx = vfs_context_current();
13608 	fsid_t fsid;
13609 	char *realpath;
13610 	int length;
13611 	int error;
13612 
13613 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13614 		return EINVAL;
13615 	}
13616 
13617 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13618 		return error;
13619 	}
13620 	AUDIT_ARG(value32, fsid.val[0]);
13621 	AUDIT_ARG(value64, objid);
13622 	/* Restrict output buffer size for now. */
13623 
13624 	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13625 		return EINVAL;
13626 	}
13627 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13628 	if (realpath == NULL) {
13629 		return ENOMEM;
13630 	}
13631 
13632 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13633 	    options, &length);
13634 
13635 	if (error) {
13636 		goto out;
13637 	}
13638 
13639 	error = copyout((caddr_t)realpath, buf, length);
13640 
13641 	*retval = (user_ssize_t)length; /* may be superseded by error */
13642 out:
13643 	kfree_data(realpath, bufsize);
13644 	return error;
13645 }
13646 
/* fsgetpath(2): legacy entry point -- no option flags. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
13653 
/* fsgetpath_ext(2): extended entry point that forwards caller options. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
13660 
13661 /*
13662  * Common routine to handle various flavors of statfs data heading out
13663  *	to user space.
13664  *
13665  * Returns:	0			Success
13666  *		EFAULT
13667  */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* A partial copy omits the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			/* Inflate the block size to compensate for the shift above. */
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* A partial copy omits the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full (un-truncated) structure size to the caller. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13781 
13782 /*
13783  * copy stat structure into user_stat structure.
13784  */
/*
 * Copy a kernel 'struct stat' into the 64-bit user-process layout,
 * zero-filling any padding/unused fields first.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13821 
/*
 * Copy a kernel 'struct stat' into the 32-bit user-process layout.
 * Time fields are narrowed with explicit casts (truncation is possible
 * for post-2038 seconds values in the 32-bit ABI).
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13858 
13859 /*
13860  * copy stat64 structure into user_stat64 structure.
13861  */
/*
 * Copy a kernel 'struct stat64' into the 64-bit user-process layout,
 * including the birthtime fields that plain 'struct stat' lacks.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13902 
/*
 * Copy a kernel 'struct stat64' into the 32-bit user-process layout.
 * Time fields are narrowed with explicit casts (truncation is possible
 * for post-2038 seconds values in the 32-bit ABI).
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13943 
13944 /*
13945  * Purge buffer cache for simulating cold starts
13946  */
/*
 * Per-vnode iterator callback for vfs_purge(): push any dirty pages and
 * invalidate the vnode's entire UBC range.
 */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
13954 
/*
 * Per-mount iterator callback for vfs_purge(): run the vnode purge
 * callback over every vnode of the mount.
 */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
13962 
/*
 * Boot-arg / sysctl (vfs.purge_vm_pagers): when true, vfs_purge() also
 * purges file-backed VM pagers in addition to the buffer cache.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
13965 
/*
 * vfs_purge(2): simulate a cold start by pushing and invalidating cached
 * pages for every vnode on every mounted filesystem.  Restricted to the
 * super-user.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
13982 
13983 /*
13984  * gets the vnode associated with the (unnamed) snapshot directory
13985  * for a Filesystem. The snapshot directory vnode is returned with
13986  * an iocount on it.
13987  */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate to the filesystem; *sdvpp is returned with an iocount. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
13993 
13994 /*
13995  * Get the snapshot vnode.
13996  *
13997  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13998  * needs nameidone() on ndp.
13999  *
14000  * If the snapshot vnode exists it is returned in ndp->ni_vp.
14001  *
14002  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14003  * not needed.
14004  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* dirfd must reference the filesystem's root vnode. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for an embedded '/'; any hit makes the name invalid. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any failure, drop iocounts so the caller sees NULL vnodes. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14107 
14108 /*
14109  * create a filesystem snapshot (for supporting filesystems)
14110  *
14111  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14112  * We get to the (unnamed) snapshot directory vnode and create the vnode
14113  * for the snapshot in it.
14114  *
14115  * Restrictions:
14116  *
14117  *    a) Passed in name for snapshot cannot have slashes.
14118  *    b) name can't be "." or ".."
14119  *
14120  * Since this requires superuser privileges, vnode_authorize calls are not
14121  * made.
14122  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* Heap-allocate; nameidata is too large for the kernel stack here. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Lookup found an existing snapshot of that name. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Caller is superuser, so skip authorization/inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14169 
14170 /*
14171  * Delete a Filesystem snapshot
14172  *
14173  * get the vnode for the unnamed snapshot directory and the snapshot and
14174  * delete the snapshot.
14175  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * DELETE-mode lookup: on success rvp, snapdvp and ndp->ni_vp (the
	 * snapshot itself) all hold iocounts.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/*
	 * Remove the snapshot directly via the filesystem; namespace
	 * events are suppressed for snapshot removal.
	 */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	/* Drop all iocounts and lookup state taken above. */
	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14204 
14205 /*
14206  * Revert a filesystem to a snapshot
14207  *
14208  * Marks the filesystem to revert to the given snapshot on next mount.
14209  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Get an iocount on the root vnode of the fs identified by dirfd. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Pass the snapshot name to the filesystem as a componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	/* note: name_len from copyinstr includes the terminating NUL */
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/*
		 * If there was any error, try again using VNOP_IOCTL on the
		 * snapshot vnode itself — presumably a fallback for
		 * filesystems that don't implement VFSIOC_REVERT_SNAPSHOT
		 * (TODO confirm against apfs).
		 */

		vnode_t snapdvp;
		struct nameidata namend;

		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		/* Drop all iocounts and lookup state from the retry path. */
		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14293 
14294 /*
14295  * rename a Filesystem snapshot
14296  *
14297  * get the vnode for the unnamed snapshot directory and the snapshot and
14298  * rename the snapshot. This is a very specialised (and simple) case of
14299  * rename(2) (which has to deal with a lot more complications). It differs
14300  * slightly from rename(2) in that EEXIST is returned if the new name exists.
14301  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * DELETE-mode lookup of the old name: on success rvp, snapdvp and
	 * fromnd->ni_vp (the snapshot being renamed) hold iocounts.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	/* Copy the new name in from user space. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a slash; loop exits early at the first one found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to the new name is treated as creating it. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/*
	 * RENAME-mode lookup of the new name in the snapshot directory
	 * (USEDVP: start from snapdvp rather than a path walk).
	 */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

	/* Unwind in strict reverse order of acquisition. */
out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14396 
14397 /*
14398  * Mount a Filesystem snapshot
14399  *
14400  * get the vnode for the unnamed snapshot directory and the snapshot and
14401  * mount the snapshot.
14402  */
14403 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14404 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14405     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14406 {
14407 	mount_t mp;
14408 	vnode_t rvp, snapdvp, snapvp, vp, pvp;
14409 	struct fs_snapshot_mount_args smnt_data;
14410 	int error;
14411 	struct nameidata *snapndp, *dirndp;
14412 	/* carving out a chunk for structs that are too big to be on stack. */
14413 	struct {
14414 		struct nameidata snapnd;
14415 		struct nameidata dirnd;
14416 	} * __snapshot_mount_data;
14417 
14418 	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14419 	snapndp = &__snapshot_mount_data->snapnd;
14420 	dirndp = &__snapshot_mount_data->dirnd;
14421 
14422 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14423 	    OP_LOOKUP, ctx);
14424 	if (error) {
14425 		goto out;
14426 	}
14427 
14428 	snapvp  = snapndp->ni_vp;
14429 	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14430 		error = EIO;
14431 		goto out1;
14432 	}
14433 
14434 	/* Get the vnode to be covered */
14435 	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14436 	    UIO_USERSPACE, directory, ctx);
14437 	error = namei(dirndp);
14438 	if (error) {
14439 		goto out1;
14440 	}
14441 
14442 	vp = dirndp->ni_vp;
14443 	pvp = dirndp->ni_dvp;
14444 	mp = vnode_mount(rvp);
14445 
14446 	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14447 		error = EINVAL;
14448 		goto out2;
14449 	}
14450 
14451 #if CONFIG_MACF
14452 	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14453 	    mp->mnt_vfsstat.f_fstypename);
14454 	if (error) {
14455 		goto out2;
14456 	}
14457 #endif
14458 
14459 	smnt_data.sm_mp  = mp;
14460 	smnt_data.sm_cnp = &snapndp->ni_cnd;
14461 	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14462 	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
14463 	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14464 
14465 out2:
14466 	vnode_put(vp);
14467 	vnode_put(pvp);
14468 	nameidone(dirndp);
14469 out1:
14470 	vnode_put(snapvp);
14471 	vnode_put(snapdvp);
14472 	vnode_put(rvp);
14473 	nameidone(snapndp);
14474 out:
14475 	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14476 	return error;
14477 }
14478 
14479 /*
14480  * Root from a snapshot of the filesystem
14481  *
14482  * Marks the filesystem to root from the given snapshot on next boot.
14483  */
14484 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14485 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14486     vfs_context_t ctx)
14487 {
14488 	int error;
14489 	vnode_t rvp;
14490 	mount_t mp;
14491 	struct fs_snapshot_root_args root_data;
14492 	struct componentname cnp;
14493 	caddr_t name_buf;
14494 	size_t name_len;
14495 
14496 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14497 	if (error) {
14498 		return error;
14499 	}
14500 	mp = vnode_mount(rvp);
14501 
14502 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14503 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14504 	if (error) {
14505 		zfree(ZV_NAMEI, name_buf);
14506 		vnode_put(rvp);
14507 		return error;
14508 	}
14509 
14510 	// XXX MAC checks ?
14511 
14512 	/*
14513 	 * Grab mount_iterref so that we can release the vnode,
14514 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14515 	 */
14516 	error = mount_iterref(mp, 0);
14517 	vnode_put(rvp);
14518 	if (error) {
14519 		zfree(ZV_NAMEI, name_buf);
14520 		return error;
14521 	}
14522 
14523 	memset(&cnp, 0, sizeof(cnp));
14524 	cnp.cn_pnbuf = (char *)name_buf;
14525 	cnp.cn_nameiop = LOOKUP;
14526 	cnp.cn_flags = ISLASTCN | HASBUF;
14527 	cnp.cn_pnlen = MAXPATHLEN;
14528 	cnp.cn_nameptr = cnp.cn_pnbuf;
14529 	cnp.cn_namelen = (int)name_len;
14530 	root_data.sr_cnp = &cnp;
14531 
14532 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14533 
14534 	mount_iterdrop(mp);
14535 	zfree(ZV_NAMEI, name_buf);
14536 
14537 	return error;
14538 }
14539 
14540 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14541 vfs_context_can_snapshot(vfs_context_t ctx)
14542 {
14543 	static const char * const snapshot_entitlements[] = {
14544 		"com.apple.private.vfs.snapshot",
14545 		"com.apple.developer.vfs.snapshot",
14546 		"com.apple.private.apfs.arv.limited.snapshot",
14547 	};
14548 	static const size_t nentitlements =
14549 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14550 	size_t i;
14551 
14552 	task_t task = vfs_context_task(ctx);
14553 	for (i = 0; i < nentitlements; i++) {
14554 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14555 			return TRUE;
14556 		}
14557 	}
14558 	return FALSE;
14559 }
14560 
14561 /*
14562  * FS snapshot operations dispatcher
14563  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require an entitlement, regardless of op. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/*
			 * No backing device vnode on the mount; fall back to
			 * looking up the "from" name directly.
			 */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permit the op if the caller is superuser, can write the
		 * backing device, or holds the user-snapshot entitlement;
		 * otherwise EPERM.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-op handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name2 is the mount point directory for SNAPSHOT_OP_MOUNT. */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14652