xref: /xnu-8792.41.9/bsd/vfs/vfs_syscalls.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117 
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122 
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125 
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130 
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137 
138 #include <nfs/nfs_conf.h>
139 
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143 
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148 
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 	((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 	release_pathbuff(x)
154 #else
155 #define GET_PATH(x)     \
156 	((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 	zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160 
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164 
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168 
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
171 #endif
172 
173 extern void disk_conditioner_unmount(mount_t mp);
174 
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 	vnode_t olddp;
178 	vnode_t newdp;
179 };
180 /* callback  for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182 
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192     boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195     struct componentname *cnp, user_addr_t fsmountargs,
196     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198 
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200 
201 struct fd_vn_data * fg_vn_data_alloc(void);
202 
203 /*
204  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205  * Concurrent lookups (or lookups by ids) on hard links can cause the
206  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207  * does) to return ENOENT as the path cannot be returned from the name cache
208  * alone. We have no option but to retry and hope to get one namei->reverse path
209  * generation done without an intervening lookup, lookup by id on the hard link
210  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211  * which currently are the MAC hooks for rename, unlink and rmdir.
212  */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214 
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217 
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219     int unlink_flags);
220 
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229 
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236 
237 __private_extern__
238 int sync_internal(void);
239 
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242 
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245 
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249 
250 extern lck_rw_t rootvnode_rw_lock;
251 
252 /*
253  * incremented each time a mount or unmount operation occurs
254  * used to invalidate the cached value of the rootvp in the
255  * mount structure utilized by cache_lookup_path
256  */
257 uint32_t mount_generation = 0;
258 
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261 
262 /* system-wide, per-boot unique mount ID */
263 static _Atomic uint64_t mount_unique_id = 1;
264 
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269 
270 /*
271  * Virtual File System System Calls
272  */
273 
274 /*
275  * Private in-kernel mounting spi (specific use-cases only)
276  */
277 boolean_t
vfs_iskernelmount(mount_t mp)278 vfs_iskernelmount(mount_t mp)
279 {
280 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
281 }
282 
/*
 * kernel_mount:
 *	In-kernel entry point for mounting a filesystem (specific
 *	use-cases only; see vfs_mount_at_path() and other kernel SPI).
 *
 * Parameters:	fstype		filesystem type name (its vfs name)
 *		pvp		parent of the covered vnode, or NULLVP to
 *				have both looked up from `path`
 *		vp		vnode to be covered, or NULLVP to have both
 *				looked up from `path`
 *		path		mount-on path (kernel address space)
 *		data		filesystem-specific mount arguments
 *		datalen		(unused) length of `data`
 *		syscall_flags	generic MNT_* mount flags
 *		kern_flags	KERNEL_MOUNT_* flags; masked with
 *				KERNEL_MOUNT_SANITIZE_MASK before use
 *		ctx		caller's vfs context
 *
 * Returns:	0		Success
 *		!0		errno from namei() or mount_common()
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any kernel-mount flags callers are not allowed to pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		/* WANTPARENT: namei() returned iocounts on both vp and its parent. */
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; just seed the component name
		 * with the path for mount_common()'s benefit.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Release the iocounts and namei state only if we did the lookup. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
332 
333 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)334 vfs_mount_at_path(const char *fstype, const char *path,
335     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
336     int mnt_flags, int flags)
337 {
338 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
339 	int error, km_flags = 0;
340 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
341 
342 	/*
343 	 * This call is currently restricted to specific use cases.
344 	 */
345 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
346 		return ENOTSUP;
347 	}
348 
349 #if !defined(XNU_TARGET_OS_OSX)
350 	if (strcmp(fstype, "lifs") == 0) {
351 		syscall_flags |= MNT_NOEXEC;
352 	}
353 #endif
354 
355 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
356 		km_flags |= KERNEL_MOUNT_NOAUTH;
357 	}
358 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
359 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
360 	}
361 
362 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
363 	    syscall_flags, km_flags, ctx);
364 	if (error) {
365 		printf("%s: mount on %s failed, error %d\n", __func__, path,
366 		    error);
367 	}
368 
369 	return error;
370 }
371 
372 int
vfs_mount_override_type_name(mount_t mp,const char * name)373 vfs_mount_override_type_name(mount_t mp, const char *name)
374 {
375 	if (mp == NULL || name == NULL) {
376 		return EINVAL;
377 	}
378 
379 	/* Override the FS type name. */
380 	mount_lock_spin(mp);
381 	strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
382 	mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
383 	mount_unlock(mp);
384 
385 	return 0;
386 }
387 
388 /*
389  * Mount a file system.
390  */
391 /* ARGSUSED */
392 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)393 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
394 {
395 	struct __mac_mount_args muap;
396 
397 	muap.type = uap->type;
398 	muap.path = uap->path;
399 	muap.flags = uap->flags;
400 	muap.data = uap->data;
401 	muap.mac_p = USER_ADDR_NULL;
402 	return __mac_mount(p, &muap, retval);
403 }
404 
/*
 * fmount:
 *	Mount a filesystem whose covered vnode is identified by an open
 *	file descriptor instead of a path (fmount(2)).
 *
 * Parameters:	p	(unused) calling process
 *		uap	fd, fs type name, user flags, fs-specific data
 *		retval	(ignored)
 *
 * Returns:	0	Success
 *		ENOTSUP	imgsrc/rootfs flags not supported here
 *		EPERM	union mounts not permitted via fmount
 *		EBUSY	fd's vnode already covered or is a fs root
 *		EINVAL	fd's vnode has no parent and is unusable
 *		!0	errno from copyinstr/file_vnode/vnode_getwithref/
 *			vn_getpath/mount_common
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname    cn;
	vfs_context_t           ctx = vfs_context_current();
	size_t                  dummy = 0;
	int                     error;
	int                     flags = uap->flags;
	char                    fstypename[MFSNAMELEN];
	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t                 pvp;
	vnode_t                 vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Translate the descriptor into a vnode (holds a file reference). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the vnode before using it. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * mount_common() needs the parent vnode.  A vnode with no
	 * discoverable parent is either already covered / a filesystem
	 * root (EBUSY) or simply unusable as a mount point (EINVAL).
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Reconstruct the mount-on path to seed the component name. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release everything acquired above, in reverse order. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
478 
479 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
480 
481 /*
482  * Get the size of a graft file (a manifest or payload file).
483  * The vp should be an iocounted vnode.
484  */
485 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)486 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
487 {
488 	struct stat64 sb = {};
489 	int error;
490 
491 	*size = 0;
492 
493 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
494 	if (error) {
495 		return error;
496 	}
497 
498 	if (sb.st_size == 0) {
499 		error = ENODATA;
500 	} else if (sb.st_size > MAX_GRAFT_METADATA_SIZE) {
501 		error = EFBIG;
502 	} else {
503 		*size = (size_t) sb.st_size;
504 	}
505 
506 	return error;
507 }
508 
509 /*
510  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
511  * `size` must already be validated.
512  */
513 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)514 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
515 {
516 	return vn_rdwr(UIO_READ, graft_vp,
517 	           (caddr_t) buf, (int) size, /* offset */ 0,
518 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
519 	           vfs_context_ucred(vctx), /* resid */ NULL,
520 	           vfs_context_proc(vctx));
521 }
522 
523 /*
524  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
525  * and read it into `buf`.
526  */
527 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)528 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
529 {
530 	vnode_t metadata_vp = NULLVP;
531 	int error;
532 
533 	// Convert this graft fd to a vnode.
534 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
535 		goto out;
536 	}
537 
538 	// Get (and validate) size information.
539 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
540 		goto out;
541 	}
542 
543 	// Read each file into the provided buffer - we must get the expected amount of bytes.
544 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
545 		goto out;
546 	}
547 
548 out:
549 	if (metadata_vp) {
550 		vnode_put(metadata_vp);
551 		metadata_vp = NULLVP;
552 	}
553 
554 	return error;
555 }
556 
557 /*
558  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
559  * provided in `gfs`, saving the size of data read in `gfs`.
560  */
561 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)562 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
563     fsioc_graft_fs_t *gfs)
564 {
565 	int error;
566 
567 	// Read the authentic manifest.
568 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
569 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
570 		return error;
571 	}
572 
573 	// The user manifest is currently unused, but set its size.
574 	gfs->user_manifest_size = 0;
575 
576 	// Read the payload.
577 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
578 	    &gfs->payload_size, gfs->payload))) {
579 		return error;
580 	}
581 
582 	return 0;
583 }
584 
585 /*
586  * Call into the filesystem to verify and graft a cryptex.
587  */
/*
 * Call into the filesystem to verify and graft a cryptex.
 *
 * Parameters:	graft_type	graft type, forwarded to the filesystem
 *		sbc_args	secure-boot cryptex args (version, manifest
 *				and payload fds, 4cc, SBC_* flags)
 *		vctx		caller's vfs context
 *		cryptex_vp	iocounted vnode for the cryptex image
 *		mounton_vp	iocounted vnode to graft upon, or NULLVP
 *
 * Returns:	0	Success
 *		!0	errno from pre-flight checks, buffer allocation,
 *			metadata reads, or the FSIOC_GRAFT_FS ioctl
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller-visible SBC_* flags into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Free the metadata buffers on all paths (kfree_data tolerates NULL here
	// only via the guards; keep the guards to match the allocation pairing).
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
676 
677 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
678 
679 /*
680  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
681  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
682  */
/*
 * graftdmg system call implementation.
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.  Copies the
 * graft arguments in from user space, resolves the optional mount-on
 * directory path and the image fd to iocounted vnodes, then hands the
 * verification and graft work to graft_secureboot_cryptex().
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Grafting is restricted to entitled tasks. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			/* namei() cleans up after itself on failure. */
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Validate the graft type before dispatching to the filesystem. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_DOWNLEVEL) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Drop the iocounts and namei state acquired above. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
750 
751 /*
752  * Ungraft a cryptex disk image (via mount dir FD)
753  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
754  */
755 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)756 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
757 {
758 	int error = 0;
759 	user_addr_t ua_mountdir = uap->mountdir;
760 	fsioc_ungraft_fs_t ugfs;
761 	vnode_t mounton_vp = NULLVP;
762 	struct nameidata nd = {};
763 	vfs_context_t ctx = vfs_context_current();
764 
765 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
766 		return EPERM;
767 	}
768 
769 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
770 		return EINVAL;
771 	}
772 
773 	ugfs.ungraft_flags = 0;
774 
775 	// Acquire vnode for mount-on path
776 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
777 	    UIO_USERSPACE, ua_mountdir, ctx);
778 
779 	error = namei(&nd);
780 	if (error) {
781 		return error;
782 	}
783 	mounton_vp = nd.ni_vp;
784 
785 	// Call into the FS to perform the ungraft
786 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
787 
788 	vnode_put(mounton_vp);
789 	nameidone(&nd);
790 
791 	return error;
792 }
793 
794 
/*
 * Notify interested parties that a mount has occurred: broadcast a
 * VQ_MOUNT vfs event, and post a NOTE_WRITE knote on `pdvp` (the parent
 * directory of the covered vnode).
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
801 
802 /*
803  * __mac_mount:
804  *	Mount a file system taking into account MAC label behavior.
805  *	See mount(2) man page for more information
806  *
807  * Parameters:    p                        Process requesting the mount
808  *                uap                      User argument descriptor (see below)
809  *                retval                   (ignored)
810  *
811  * Indirect:      uap->type                Filesystem type
812  *                uap->path                Path to mount
813  *                uap->data                Mount arguments
814  *                uap->mac_p               MAC info
815  *                uap->flags               Mount flags
816  *
817  *
818  * Returns:        0                       Success
819  *                !0                       Not success
820  */
/* Set when something attempts to mount the root FS read/write; consulted
 * by the CHECK_CS_VALIDATION_BITMAP path (see 7392553). */
boolean_t root_fs_upgrade_try = FALSE;
822 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (WANTPARENT also returns its parent)
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The struct mac layout differs for 32- and 64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label buffer before allocating from it. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/*
	 * Mounting onto the root of the root filesystem is treated
	 * specially: it is normally an update rather than a new mount.
	 */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

	/* Common exit: free the label and drop namei's iocounts/state. */
#if CONFIG_MACF
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
983 
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename	file system type (i.e. its vfs name)
 *  pvp		parent of covered vnode
 *  vp		covered vnode
 *  cnp		component name (i.e. path) of covered vnode
 *  flags	generic mount flags
 *  fsmountargs	file system specific data
 *  labelstr	optional MAC label
 *  internal_flags	KERNEL_MOUNT_* flags; KERNEL_MOUNT_KMOUNT marks mounts
 *		initiated from inside the kernel
 *  ctx		caller's context
 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;           /* device vnode found via namei(); carries an iocount */
	struct vnode *device_vnode = NULLVP;    /* device vnode handed to VFS_MOUNT / opened here */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;                  /* true once 'flag' holds the pre-update mnt_flag */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                       /* nonzero once 'mp' was allocated by this call */
	boolean_t vfsp_ref = FALSE;             /* we bumped vfsp->vfc_refcount */
	boolean_t is_rwlock_locked = FALSE;     /* mnt_rwlock held exclusive */
	boolean_t did_rele = FALSE;             /* vnode_rele() already done on device_vnode in out4 */
	boolean_t have_usecount = FALSE;        /* vnode_ref() taken on covered vnode */
	boolean_t did_set_lmount = FALSE;       /* MNT_LMOUNT set in mnt_lflag; must be cleared on exit */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each iteration clears the lowest set bit */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Only the root vnode of a mounted filesystem can be updated */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Remember the pre-update flags so a failed update can restore them */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Look up the filesystem type and pin it with a refcount */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	/* Flush the covered vnode and mark it VMOUNT (mount in progress) */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* Prefer the full resolved path; fall back to the lookup's pathname buffer */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

	/* Fresh-mount and MNT_UPDATE paths converge here */
update:

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			/* Must be a block device with a registered major number */
			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				mode_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_mountedon(devvp))) {
				goto out3;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem.  We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			/*
			 * NOTE(review): this branch appears unreachable (the enclosing
			 * "else if" guarantees one of the two role bits is set, so
			 * mount_role != 0), and `error` has not been assigned on this
			 * path when it is printed below — confirm before relying on it.
			 */
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* Ordinary mount: hand the (possibly NULL) device vnode to the filesystem */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 *
			 *   Need to do this before calling the MAC hook as it needs
			 *   information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* Hook the new mount onto the covered vnode */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			device_vnode->v_specflags |= SI_MOUNTEDON;
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	/* Drop the usecount taken before VNOP_OPEN unless out4 already did */
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
	}
out2:
	/* Drop the iocount taken by namei() on the device vnode */
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1871 
1872 /*
1873  * Flush in-core data, check for competing mount attempts,
1874  * and set VMOUNT
1875  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH skips the ownership check entirely */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	/* fmount uses a stricter busy test (see below) */
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data on the covered vnode before it is hidden by the mount */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can serve as mount points */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);
	/*
	 * fmount: busy if a mount attempt is in flight (VMOUNT) OR something
	 * is already mounted here; regular mount: busy only when both hold.
	 */
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Claim the vnode for this mount attempt; cleared below on MAC denial */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	/* Give the MAC policy a chance to veto; undo the VMOUNT claim if denied */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	/* 0 on success (vp flushed and marked VMOUNT); errno otherwise */
	return error;
}
1941 
1942 #if CONFIG_IMGSRC_ACCESS
1943 
1944 #define DEBUG_IMGSRC 0
1945 
1946 #if DEBUG_IMGSRC
1947 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1948 #else
1949 #define IMGSRC_DEBUG(args...) do { } while(0)
1950 #endif
1951 
1952 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1953 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1954 {
1955 	struct nameidata nd;
1956 	vnode_t vp, realdevvp;
1957 	mode_t accessmode;
1958 	int error;
1959 	enum uio_seg uio = UIO_USERSPACE;
1960 
1961 	if (ctx == vfs_context_kernel()) {
1962 		uio = UIO_SYSSPACE;
1963 	}
1964 
1965 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1966 	if ((error = namei(&nd))) {
1967 		IMGSRC_DEBUG("namei() failed with %d\n", error);
1968 		return error;
1969 	}
1970 
1971 	vp = nd.ni_vp;
1972 
1973 	if (!vnode_isblk(vp)) {
1974 		IMGSRC_DEBUG("Not block device.\n");
1975 		error = ENOTBLK;
1976 		goto out;
1977 	}
1978 
1979 	realdevvp = mp->mnt_devvp;
1980 	if (realdevvp == NULLVP) {
1981 		IMGSRC_DEBUG("No device backs the mount.\n");
1982 		error = ENXIO;
1983 		goto out;
1984 	}
1985 
1986 	error = vnode_getwithref(realdevvp);
1987 	if (error != 0) {
1988 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1989 		goto out;
1990 	}
1991 
1992 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1993 		IMGSRC_DEBUG("Wrong dev_t.\n");
1994 		error = ENXIO;
1995 		goto out1;
1996 	}
1997 
1998 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1999 
2000 	/*
2001 	 * If mount by non-root, then verify that user has necessary
2002 	 * permissions on the device.
2003 	 */
2004 	if (!vfs_context_issuser(ctx)) {
2005 		accessmode = KAUTH_VNODE_READ_DATA;
2006 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2007 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2008 		}
2009 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2010 			IMGSRC_DEBUG("Access denied.\n");
2011 			goto out1;
2012 		}
2013 	}
2014 
2015 	*devvpp = vp;
2016 
2017 out1:
2018 	vnode_put(realdevvp);
2019 
2020 out:
2021 	nameidone(&nd);
2022 
2023 	if (error) {
2024 		vnode_put(vp);
2025 	}
2026 
2027 	return error;
2028 }
2029 
2030 /*
2031  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2032  * and call checkdirs()
2033  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap VMOUNT (pending) for v_mountedhere (established) atomically */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* The covered vnode holds a long-term usecount while mounted on */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* Any failure resets the covered-vnode linkage set above */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2079 
/*
 * Reverse place_mount_and_checkdirs(): drop the covered vnode's usecount
 * and detach 'mp' from it on the failure path.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2090 
2091 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2092 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2093 {
2094 	int error;
2095 
2096 	/* unmount in progress return error */
2097 	mount_lock_spin(mp);
2098 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2099 		mount_unlock(mp);
2100 		return EBUSY;
2101 	}
2102 	mount_unlock(mp);
2103 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2104 
2105 	/*
2106 	 * We only allow the filesystem to be reloaded if it
2107 	 * is currently mounted read-only.
2108 	 */
2109 	if ((flags & MNT_RELOAD) &&
2110 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2111 		error = ENOTSUP;
2112 		goto out;
2113 	}
2114 
2115 	/*
2116 	 * Only root, or the user that did the original mount is
2117 	 * permitted to update it.
2118 	 */
2119 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2120 	    (!vfs_context_issuser(ctx))) {
2121 		error = EPERM;
2122 		goto out;
2123 	}
2124 #if CONFIG_MACF
2125 	error = mac_mount_check_remount(ctx, mp);
2126 	if (error != 0) {
2127 		goto out;
2128 	}
2129 #endif
2130 
2131 out:
2132 	if (error) {
2133 		lck_rw_done(&mp->mnt_rwlock);
2134 	}
2135 
2136 	return error;
2137 }
2138 
/*
 * End an update begun by mount_begin_update(): release the mount rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2144 
2145 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2146 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2147 {
2148 	vnode_t vp;
2149 
2150 	if (height >= MAX_IMAGEBOOT_NESTING) {
2151 		return EINVAL;
2152 	}
2153 
2154 	vp = imgsrc_rootvnodes[height];
2155 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2156 		*rvpp = vp;
2157 		return 0;
2158 	} else {
2159 		return ENOENT;
2160 	}
2161 }
2162 
/*
 * Relocate an imageboot source filesystem onto a new covered vnode
 * (the MNT_IMGSRC_BY_INDEX mount path, plus a legacy single-level form).
 *
 * pvp/vp      - parent and new covered vnode for the relocated mount
 * cnp/fsname  - lookup name data; fsname must match the mount's fs type
 * is64bit     - selects the user argument structure layout for copyin
 * fsmountargs - user pointer to mnt_imgsrc_args (or a bare devpath in the
 *               legacy form)
 * by_index    - TRUE when the caller passed height/flags explicitly
 *
 * A filesystem may be moved at most once (guarded by MNTK_HAS_MOVED,
 * which is only set/tested under the mount rwlock).
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No mi_flags are currently defined; reject anything nonzero */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp on success */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount it returned */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Keep the old mnt-on name so out3 can restore it on failure */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Restore the on-name and allow a future move attempt */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2383 
2384 #endif /* CONFIG_IMGSRC_ACCESS */
2385 
/*
 * Enable disk quotas on an HFS mount when the per-type quota trigger
 * files exist under the mount point.  Failures are deliberately ignored;
 * quota setup must not interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		/* Trigger file exists; drop the iocount namei() took on it */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2419 
2420 
/*
 * Per-process callback for checkdirs(): if the process's cwd or root
 * directory is 'olddp' (the vnode just covered by a mount), redirect it
 * to 'newdp' (the root of the new filesystem), fixing up usecounts.
 * Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		/* cwd redirected: old vnode loses a ref, new ref is consumed */
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		/* root redirected: same accounting as the cwd case */
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2500 
2501 
2502 
2503 /*
2504  * Scan all active processes to see if any of them have a current
2505  * or root directory onto which the new filesystem has just been
2506  * mounted. If so, replace them with the new mount point.
2507  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only our caller references olddp: nobody's cwd/root points at it */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Root of the newly-mounted filesystem, with an iocount held */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2545 
2546 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2547 	"com.apple.private.vfs.role-account-unmount"
2548 
2549 /*
2550  * Unmount a file system.
2551  *
2552  * Note: unmount takes a path to the vnode mounted on as argument,
2553  * not special file (as before).
2554  */
2555 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Take a mount ref before dropping the iocount on vp */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2602 
2603 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2604 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2605 {
2606 	mount_t mp;
2607 
2608 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2609 	if (mp == (mount_t)0) {
2610 		return ENOENT;
2611 	}
2612 	mount_ref(mp, 0);
2613 	mount_iterdrop(mp);
2614 	/* safedounmount consumes the mount ref */
2615 	return safedounmount(mp, flags, ctx);
2616 }
2617 
2618 /*
2619  * The mount struct comes with a mount ref which will be consumed.
2620  * Do the actual file system unmount, prevent some common foot shooting.
2621  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 *
	 * NOTE(review): this tests MNT_LNOTRESP against mnt_kern_flag;
	 * MNT_LNOTRESP looks like an mnt_lflag bit — confirm intended.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over the mount ref (withref == 1) */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Failure: we still own the mount ref, so drop it here */
	mount_drop(mp, 0);
	return error;
}
2685 
2686 /*
2687  * Do the actual file system unmount.
2688  */
/*
 * mp      - mount to tear down
 * flags   - MNT_FORCE / MNT_NOBLOCK / MNT_LNOSUB, etc.
 * withref - nonzero when the caller passed in a mount ref to be consumed
 * ctx     - VFS context for VFS_SYNC/VFS_UNMOUNT and trigger callbacks
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Don't let this process hang on unresponsive remote filesystems */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: abandon the unmount, clear in-progress state */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain and this is not a forced unmount: back out */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused the unmount: re-enable iteration and back out */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Defer the wakeup until the mount lock is dropped below */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Notify watchers of the directory that lost a mount */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2973 
2974 /*
2975  * Unmount any mounts in this filesystem.
2976  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			/* Covered by a mount we're unmounting => it's a submount too */
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	/* kfree_data() tolerates a NULL pointer on the allocation-failure path */
	kfree_data(fsids, fsids_sz);
}
3034 
/*
 * Drop one cross reference on mount 'mp' held via covered vnode 'dp'.
 * When the last crossref goes away and 'mp' is not (or no longer) the
 * mount covering 'dp', the mount structure itself is destroyed and freed.
 * If 'need_put' is set, the caller's iocount on 'dp' is released here.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold and lock dp so v_mountedhere can be examined consistently. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* Last crossref and mp no longer mounted here: tear it down. */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3064 
3065 
3066 /*
3067  * Sync each mounted filesystem.
3068  */
#if DIAGNOSTIC
int syncprt = 0;        /* when set, sync() and sync_thread() call vfs_bufstats() */
#endif

int print_vmpage_stat = 0;      /* when set, sync paths call vm_countdirtypages() */
3074 
3075 /*
3076  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3077  *			mounted read-write with the passed waitfor value.
3078  *
3079  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3080  *		arg	user argument (please see below)
3081  *
3082  * User argument is a pointer to 32 bit unsigned integer which describes the
3083  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3084  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3085  * waitfor value.
3086  *
3087  * Returns:		VFS_RETURNED
3088  */
3089 static int
sync_callback(mount_t mp,void * arg)3090 sync_callback(mount_t mp, void *arg)
3091 {
3092 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3093 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3094 		unsigned waitfor = MNT_NOWAIT;
3095 
3096 		if (arg) {
3097 			waitfor = *(uint32_t*)arg;
3098 		}
3099 
3100 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3101 		if (waitfor != MNT_WAIT &&
3102 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3103 		    waitfor != MNT_NOWAIT &&
3104 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3105 		    waitfor != MNT_DWAIT &&
3106 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3107 			panic("Passed inappropriate waitfor %u to "
3108 			    "sync_callback()", waitfor);
3109 		}
3110 
3111 		mp->mnt_flag &= ~MNT_ASYNC;
3112 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3113 		if (asyncflag) {
3114 			mp->mnt_flag |= MNT_ASYNC;
3115 		}
3116 	}
3117 
3118 	return VFS_RETURNED;
3119 }
3120 
3121 /* ARGSUSED */
/*
 * sync() system call: run sync_callback() over every mount with the
 * default MNT_NOWAIT behavior (arg == NULL) and return immediately.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3138 
/*
 * Media classes used by sync_internal_callback() to restrict which
 * mounts a sync pass touches.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local and not a virtual device */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* virtual device or not local */
} sync_type_t;
3144 
3145 static int
sync_internal_callback(mount_t mp,void * arg)3146 sync_internal_callback(mount_t mp, void *arg)
3147 {
3148 	if (arg) {
3149 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3150 		    (mp->mnt_flag & MNT_LOCAL);
3151 		sync_type_t sync_type = *((sync_type_t *)arg);
3152 
3153 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3154 			return VFS_RETURNED;
3155 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3156 			return VFS_RETURNED;
3157 		}
3158 	}
3159 
3160 	(void)sync_callback(mp, NULL);
3161 
3162 	return VFS_RETURNED;
3163 }
3164 
/* State bits below; mutated only with sync_mtx_lck held. */
int sync_thread_state = 0;
/* Upper bound (seconds) on how long sync_internal() blocks for the thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001    /* a(nother) sync pass was requested */
#define SYNC_THREAD_RUNNING   0x0002    /* sync_thread is currently alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread, else NULL */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3174 
/*
 * Body of the kernel thread started by sync_internal().  Loops while new
 * SYNC_THREAD_RUN requests arrive; each pass syncs reliable (local,
 * non-virtual) media first, then unreliable media.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; drop the lock for the (slow) sync work. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3218 
/* Rate-limits the "sync timed out" message below to once per 120 seconds. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};

/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Always returns 0, even when the worker thread could not be created or
 * the wait timed out.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Request a pass; spawn the worker thread unless one is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Bounded wait for the sync thread's wakeup(); PDROP releases
	 * sync_mtx_lck when msleep() returns.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		/* Drop the reference kernel_thread_start() gave us. */
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3267 
3268 /*
3269  * Change filesystem quotas.
3270  */
#if QUOTA
/*
 * quotactl() system call: manipulate filesystem quotas for the volume
 * containing uap->path.  The meaning of uap->arg depends on the subcommand
 * (quota file path for Q_QUOTAON, a dqblk for Q_GETQUOTA/Q_SETQUOTA/
 * Q_SETUSE, an int for Q_QUOTASTAT).
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Keep a mount reference; the vnode iocount can be dropped right away. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Second switch: free Q_QUOTAON's buffer / copy results back out. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
#else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quotas are compiled out of this kernel. */
	return EOPNOTSUPP;
}
#endif /* QUOTA */
3385 
3386 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3387 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3388 {
3389 	int error;
3390 	vfs_context_t ctx = vfs_context_current();
3391 
3392 #if CONFIG_MACF
3393 	error = mac_mount_check_stat(ctx, mp);
3394 	if (error != 0) {
3395 		return error;
3396 	}
3397 #endif
3398 
3399 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3400 	if (error != 0) {
3401 		return error;
3402 	}
3403 
3404 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3405 }
3406 
3407 /*
3408  * Get filesystem statistics.
3409  *
3410  * Returns:	0			Success
3411  *	namei:???
3412  *	vfs_update_vfsstat:???
3413  *	munge_statfs:EFAULT
3414  */
3415 /* ARGSUSED */
3416 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3417 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3418 {
3419 	int error;
3420 	struct mount *mp;
3421 	struct nameidata nd;
3422 	vfs_context_t ctx = vfs_context_current();
3423 	vnode_t vp;
3424 
3425 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3426 	    UIO_USERSPACE, uap->path, ctx);
3427 	error = namei(&nd);
3428 	if (error != 0) {
3429 		return error;
3430 	}
3431 	vp = nd.ni_vp;
3432 	mp = vp->v_mount;
3433 	nameidone(&nd);
3434 
3435 	error = statfs_internal(p, mp, uap->buf);
3436 	vnode_put(vp);
3437 
3438 	return error;
3439 }
3440 
3441 /*
3442  * Get filesystem statistics.
3443  */
3444 /* ARGSUSED */
3445 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3446 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3447 {
3448 	int error;
3449 	vnode_t vp = NULL;
3450 	struct mount *mp;
3451 
3452 	AUDIT_ARG(fd, uap->fd);
3453 
3454 	if ((error = file_vnode(uap->fd, &vp)) ||
3455 	    (error = vnode_getwithref(vp))) {
3456 		goto out;
3457 	}
3458 
3459 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3460 
3461 	mp = vp->v_mount;
3462 	if (!mp) {
3463 		error = EBADF;
3464 		goto out_vnode;
3465 	}
3466 
3467 	error = statfs_internal(p, mp, uap->buf);
3468 
3469 out_vnode:
3470 	vnode_put(vp);
3471 
3472 out:
3473 	if (vp != NULL) {
3474 		file_drop(uap->fd);
3475 	}
3476 
3477 	return error;
3478 }
3479 
3480 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3481 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3482 {
3483 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3484 
3485 	bzero(sfs, sizeof(*sfs));
3486 
3487 	sfs->f_bsize = vsfs->f_bsize;
3488 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3489 	sfs->f_blocks = vsfs->f_blocks;
3490 	sfs->f_bfree = vsfs->f_bfree;
3491 	sfs->f_bavail = vsfs->f_bavail;
3492 	sfs->f_files = vsfs->f_files;
3493 	sfs->f_ffree = vsfs->f_ffree;
3494 	sfs->f_fsid = vsfs->f_fsid;
3495 	sfs->f_owner = vsfs->f_owner;
3496 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3497 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3498 	sfs->f_fssubtype = vsfs->f_fssubtype;
3499 	sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3500 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3501 		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3502 	} else {
3503 		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3504 	}
3505 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3506 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3507 }
3508 
3509 /*
3510  * Get file system statistics in 64-bit mode
3511  */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/*
	 * nameidata and statfs64 are both large; allocate them together on
	 * the heap, presumably to keep them off the kernel stack -- confirm.
	 */
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh the cached statistics before reporting them. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3569 
3570 /*
3571  * Get file system statistics in 64-bit mode
3572  */
3573 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3574 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3575 {
3576 	struct vnode *vp;
3577 	struct mount *mp;
3578 	struct statfs64 sfs;
3579 	int error;
3580 
3581 	AUDIT_ARG(fd, uap->fd);
3582 
3583 	if ((error = file_vnode(uap->fd, &vp))) {
3584 		return error;
3585 	}
3586 
3587 	error = vnode_getwithref(vp);
3588 	if (error) {
3589 		file_drop(uap->fd);
3590 		return error;
3591 	}
3592 
3593 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3594 
3595 	mp = vp->v_mount;
3596 	if (!mp) {
3597 		error = EBADF;
3598 		goto out;
3599 	}
3600 
3601 #if CONFIG_MACF
3602 	error = mac_mount_check_stat(vfs_context_current(), mp);
3603 	if (error != 0) {
3604 		goto out;
3605 	}
3606 #endif
3607 
3608 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3609 		goto out;
3610 	}
3611 
3612 	vfs_get_statfs64(mp, &sfs);
3613 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3614 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3615 		/* This process does not want to see a seperate data volume mountpoint */
3616 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3617 	}
3618 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3619 
3620 out:
3621 	file_drop(uap->fd);
3622 	vnode_put(vp);
3623 
3624 	return error;
3625 }
3626 
/*
 * Iteration state shared by the getfsstat*() vfs_iterate() callbacks.
 */
struct getfsstat_struct {
	user_addr_t     sfsp;           /* user-buffer cursor for the next record */
	user_addr_t     *mp;            /* optional per-mount MAC label pointers */
	int             count;          /* mounts visited so far (copied or not) */
	int             maxcount;       /* records that fit in the user buffer */
	int             flags;          /* caller's MNT_NOWAIT/MNT_WAIT/MNT_DWAIT */
	int             error;          /* first error encountered, 0 if none */
};
3635 
3636 
/*
 * vfs_iterate() callback for __mac_getfsstat(): copy a (32/64-bit munged)
 * statfs record for each mount out to the user buffer carried in 'arg',
 * optionally copying out the mount's MAC label as well.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead or failed-to-refresh mounts are skipped, not fatal. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* munge_statfs() reported how many bytes it wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Count every mount, even ones not copied out (see retval logic). */
	fstp->count++;
	return VFS_RETURNED;
}
3690 
3691 /*
3692  * Get statistics on all filesystems.
3693  */
3694 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3695 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3696 {
3697 	struct __mac_getfsstat_args muap;
3698 
3699 	muap.buf = uap->buf;
3700 	muap.bufsize = uap->bufsize;
3701 	muap.mac = USER_ADDR_NULL;
3702 	muap.macsize = 0;
3703 	muap.flags = uap->flags;
3704 
3705 	return __mac_getfsstat(p, &muap, retval);
3706 }
3707 
3708 /*
3709  * __mac_getfsstat: Get MAC-related file system statistics
3710  *
3711  * Parameters:    p                        (ignored)
3712  *                uap                      User argument descriptor (see below)
3713  *                retval                   Count of file system statistics (N stats)
3714  *
3715  * Indirect:      uap->bufsize             Buffer size
3716  *                uap->macsize             MAC info size
3717  *                uap->buf                 Buffer where information will be returned
3718  *                uap->mac                 MAC info
3719  *                uap->flags               File system flags
3720  *
3721  *
3722  * Returns:        0                       Success
3723  *                !0                       Not success
3724  *
3725  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int bookkeeping used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Record size depends on the caller's 32/64-bit ABI. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC pointer array must have one entry per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* NOSKIP_UNMOUNT: include mounts that are being unmounted. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Return the number of records written, capped at the buffer size. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3819 
3820 static int
getfsstat64_callback(mount_t mp,void * arg)3821 getfsstat64_callback(mount_t mp, void * arg)
3822 {
3823 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3824 	struct vfsstatfs *sp;
3825 	struct statfs64 sfs;
3826 	int error;
3827 
3828 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
3829 #if CONFIG_MACF
3830 		error = mac_mount_check_stat(vfs_context_current(), mp);
3831 		if (error != 0) {
3832 			fstp->error = error;
3833 			return VFS_RETURNED_DONE;
3834 		}
3835 #endif
3836 		sp = &mp->mnt_vfsstat;
3837 		/*
3838 		 * If MNT_NOWAIT is specified, do not refresh the fsstat
3839 		 * cache. MNT_WAIT overrides MNT_NOWAIT.
3840 		 *
3841 		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3842 		 * getfsstat, since the constants are out of the same
3843 		 * namespace.
3844 		 */
3845 		if ((mp->mnt_lflag & MNT_LDEAD) ||
3846 		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3847 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3848 		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3849 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3850 			return VFS_RETURNED;
3851 		}
3852 
3853 		vfs_get_statfs64(mp, &sfs);
3854 		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3855 		if (error) {
3856 			fstp->error = error;
3857 			return VFS_RETURNED_DONE;
3858 		}
3859 		fstp->sfsp += sizeof(sfs);
3860 	}
3861 	fstp->count++;
3862 	return VFS_RETURNED;
3863 }
3864 
3865 /*
3866  * Get statistics on all file systems in 64 bit mode.
3867  */
3868 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3869 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3870 {
3871 	user_addr_t sfsp;
3872 	int count, maxcount;
3873 	struct getfsstat_struct fst;
3874 
3875 	maxcount = uap->bufsize / sizeof(struct statfs64);
3876 
3877 	sfsp = uap->buf;
3878 	count = 0;
3879 
3880 	fst.sfsp = sfsp;
3881 	fst.flags = uap->flags;
3882 	fst.count = 0;
3883 	fst.error = 0;
3884 	fst.maxcount = maxcount;
3885 
3886 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3887 
3888 	if (fst.error) {
3889 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3890 		return fst.error;
3891 	}
3892 
3893 	if (fst.sfsp && fst.count > fst.maxcount) {
3894 		*retval = fst.maxcount;
3895 	} else {
3896 		*retval = fst.count;
3897 	}
3898 
3899 	return 0;
3900 }
3901 
3902 /*
3903  * gets the associated vnode with the file descriptor passed.
3904  * as input
3905  *
3906  * INPUT
3907  * ctx - vfs context of caller
3908  * fd - file descriptor for which vnode is required.
3909  * vpp - Pointer to pointer to vnode to be returned.
3910  *
3911  * The vnode is returned with an iocount so any vnode obtained
3912  * by this call needs a vnode_put
3913  *
3914  */
3915 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3916 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3917 {
3918 	int error;
3919 	vnode_t vp;
3920 	struct fileproc *fp;
3921 	proc_t p = vfs_context_proc(ctx);
3922 
3923 	*vpp =  NULLVP;
3924 
3925 	error = fp_getfvp(p, fd, &fp, &vp);
3926 	if (error) {
3927 		return error;
3928 	}
3929 
3930 	error = vnode_getwithref(vp);
3931 	if (error) {
3932 		(void)fp_drop(p, fd, fp, 0);
3933 		return error;
3934 	}
3935 
3936 	(void)fp_drop(p, fd, fp, 0);
3937 	*vpp = vp;
3938 	return error;
3939 }
3940 
3941 /*
3942  * Wrapper function around namei to start lookup from a directory
3943  * specified by a file descriptor ni_dirfd.
3944  *
3945  * In addition to all the errors returned by namei, this call can
3946  * return ENOTDIR if the file descriptor does not refer to a directory.
3947  * and EBADF if the file descriptor is not valid.
3948  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult dirfd for a fresh lookup: continued lookups
	 * (NAMEI_CONTLOOKUP) and callers that supplied their own starting
	 * directory (USEDVP) keep their existing state.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at the fd's directory. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, or dirfd not applicable: plain namei(). */
	return namei(ndp);
}
3992 
3993 /*
3994  * Change current working directory to a given file descriptor.
3995  */
3996 /* ARGSUSED */
/*
 * Change the current working directory to the directory referenced by
 * the open descriptor uap->fd.  When 'per_thread' is set, the cwd is the
 * calling thread's (uu_cdir); uap->fd == -1 then means "revert to the
 * per-process cwd".
 */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* If something is mounted here, descend to the root of the topmost mount. */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-term usecount on the new cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		/* Per-thread cwd: stash it in the uthread and flag the proc. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Per-process cwd: swap it under the fd/dirs locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4109 
/*
 * fchdir() system call: change the process's current working directory
 * to the directory referenced by an open file descriptor.
 */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
4115 
/*
 * Per-thread variant of fchdir(); uap->fd == -1 reverts the thread to
 * the per-process working directory (see common_fchdir()).
 */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * NOTE(review): the cast relies on struct __pthread_fchdir_args
	 * matching the layout of struct fchdir_args -- verify if either
	 * definition changes.
	 */
	return common_fchdir(p, (void *)uap, 1);
}
4121 
4122 
4123 /*
4124  * Change current working directory (".").
4125  *
4126  * Returns:	0			Success
4127  *	change_dir:ENOTDIR
4128  *	change_dir:???
4129  *	vnode_ref:ENOENT		No such file or directory
4130  */
4131 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-term usecount on the new cwd. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Per-thread cwd: stash it in the uthread and flag the proc. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Per-process cwd: swap it under the fd/dirs locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4177 
4178 
4179 /*
4180  * Change current working directory (".").
4181  *
4182  * Returns:	0			Success
4183  *	chdir_internal:ENOTDIR
4184  *	chdir_internal:ENOENT		No such file or directory
4185  *	chdir_internal:???
4186  */
4187 /* ARGSUSED */
4188 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4189 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4190 {
4191 	struct nameidata nd;
4192 	vfs_context_t ctx = vfs_context_current();
4193 
4194 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4195 	    UIO_USERSPACE, uap->path, ctx);
4196 
4197 	return chdir_internal(p, ctx, &nd, per_thread);
4198 }
4199 
4200 
4201 /*
4202  * chdir
4203  *
4204  * Change current working directory (".") for the entire process
4205  *
4206  * Parameters:  p       Process requesting the call
4207  *              uap     User argument descriptor (see below)
4208  *              retval  (ignored)
4209  *
4210  * Indirect parameters:	uap->path	Directory path
4211  *
4212  * Returns:	0			Success
4213  *              common_chdir: ENOTDIR
4214  *              common_chdir: ENOENT	No such file or directory
4215  *              common_chdir: ???
4216  *
4217  */
4218 int
chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4219 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4220 {
4221 	return common_chdir(p, (void *)uap, 0);
4222 }
4223 
4224 /*
4225  * __pthread_chdir
4226  *
4227  * Change current working directory (".") for a single thread
4228  *
4229  * Parameters:  p       Process requesting the call
4230  *              uap     User argument descriptor (see below)
4231  *              retval  (ignored)
4232  *
4233  * Indirect parameters:	uap->path	Directory path
4234  *
4235  * Returns:	0			Success
4236  *              common_chdir: ENOTDIR
4237  *		common_chdir: ENOENT	No such file or directory
4238  *		common_chdir: ???
4239  *
4240  */
4241 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4242 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4243 {
4244 	return common_chdir(p, (void *)uap, 1);
4245 }
4246 
4247 
4248 /*
4249  * Change notion of root (``/'') directory.
4250  */
4251 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot(2) is restricted to the superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Resolve and validate the new root; returns with an iocount held. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Swap the iocount for a long-term usecount on the new root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount held by the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4309 
4310 #define PATHSTATICBUFLEN 256
4311 #define PIVOT_ROOT_ENTITLEMENT              \
4312        "com.apple.private.vfs.pivot-root"
4313 
4314 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Stack buffers for short paths; heap fallback on ENAMETOOLONG below. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; retry with a MAXPATHLEN heap buffer if long. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-stage copyin for the path the old root will be moved to. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point at whichever buffer (stack or heap) actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Drop the iocount taken by vnode_lookup, then free any heap paths. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4406 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is implemented only on macOS (see the #if above). */
	return nosys(p, NULL, retval);
}
4412 #endif /* XNU_TARGET_OS_OSX */
4413 
4414 /*
4415  * Common routine for chroot and chdir.
4416  *
4417  * Returns:	0			Success
4418  *		ENOTDIR			Not a directory
4419  *		namei:???		[anything namei can return]
4420  *		vnode_authorize:???	[anything vnode_authorize can return]
4421  */
4422 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4423 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4424 {
4425 	vnode_t vp;
4426 	int error;
4427 
4428 	if ((error = namei(ndp))) {
4429 		return error;
4430 	}
4431 	nameidone(ndp);
4432 	vp = ndp->ni_vp;
4433 
4434 	if (vp->v_type != VDIR) {
4435 		vnode_put(vp);
4436 		return ENOTDIR;
4437 	}
4438 
4439 #if CONFIG_MACF
4440 	error = mac_vnode_check_chdir(ctx, vp);
4441 	if (error) {
4442 		vnode_put(vp);
4443 		return error;
4444 	}
4445 #endif
4446 
4447 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4448 	if (error) {
4449 		vnode_put(vp);
4450 		return error;
4451 	}
4452 
4453 	return error;
4454 }
4455 
/*
 * Allocate the vnode data (for directories) associated with the file glob.
 */
4459 struct fd_vn_data *
fg_vn_data_alloc(void)4460 fg_vn_data_alloc(void)
4461 {
4462 	struct fd_vn_data *fvdata;
4463 
4464 	/* Allocate per fd vnode data */
4465 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4466 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4467 	return fvdata;
4468 }
4469 
4470 /*
4471  * Free the vnode data (for directories) associated with the file glob.
4472  */
4473 void
fg_vn_data_free(void * fgvndata)4474 fg_vn_data_free(void *fgvndata)
4475 {
4476 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4477 
4478 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4479 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4480 	kfree_type(struct fd_vn_data, fvdata);
4481 }
4482 
4483 /*
4484  * Check permissions, allocate an open file structure,
4485  * and call the device open routine if any.
4486  *
4487  * Returns:	0			Success
4488  *		EINVAL
4489  *		EINTR
4490  *	falloc:ENFILE
4491  *	falloc:EMFILE
4492  *	falloc:ENOMEM
4493  *	vn_open_auth:???
4494  *	dupfdopen:???
4495  *	VNOP_ADVLOCK:???
4496  *	vnode_setsize:???
4497  *
4498  * XXX Need to implement uid, gid
4499  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel fcntl-style flags. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot and fileproc up front. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional authentication vnode supplied by openat_dprotected_np. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd set means fdesc_open was hit:
		 * duplicate the existing descriptor instead of failing.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Acquire the requested flock(2)-style advisory lock, if any. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	/*
	 * Drop the iocount from vn_open_auth; the fileglob retains its own
	 * reference to vp — assumed sufficient for the flag reads below
	 * (vnode_istty) — NOTE(review): confirm against vn_open_auth.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor in the fd table. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo the partially-completed open: unlock, close, drop the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4754 
4755 /*
4756  * While most of the *at syscall handlers can call nameiat() which
4757  * is a wrapper around namei, the use of namei and initialisation
4758  * of nameidata are far removed and in different functions  - namei
4759  * gets called in vn_open_auth for open1. So we'll just do here what
4760  * nameiat() does.
4761  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * Only relative paths need the dirfd; absolute paths and callers that
	 * already supplied a starting directory (USEDVP) go straight to open1.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: resolve it against the dirfd vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			/* Drop the iocount taken by vnode_getfromfd. */
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4805 
4806 /*
4807  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4808  *
4809  * Parameters:	p			Process requesting the open
4810  *		uap			User argument descriptor (see below)
4811  *		retval			Pointer to an area to receive the
 *					return value from the system call
4813  *
4814  * Indirect:	uap->path		Path to open (same as 'open')
4815  *		uap->flags		Flags to open (same as 'open'
4816  *		uap->uid		UID to set, if creating
4817  *		uap->gid		GID to set, if creating
4818  *		uap->mode		File mode, if creating (same as 'open')
4819  *		uap->xsecurity		ACL to set, if creating
4820  *
4821  * Returns:	0			Success
4822  *		!0			errno value
4823  *
4824  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4825  *
 * XXX:		We should enumerate the possible errno values here, and where
4827  *		in the code they originated.
4828  */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the ACL (file security descriptor), if one was supplied. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask; strip the sticky bit. */
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4871 
4872 /*
4873  * Go through the data-protected atomically controlled open (2)
4874  *
4875  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4876  */
4877 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4878 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4879     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4880 {
4881 	/*
4882 	 * Follow the same path as normal open(2)
4883 	 * Look up the item if it exists, and acquire the vnode.
4884 	 */
4885 	struct vnode_attr va;
4886 	struct nameidata nd;
4887 	int cmode;
4888 	int error;
4889 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4890 
4891 	VATTR_INIT(&va);
4892 	/* Mask off all but regular access permissions */
4893 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4894 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4895 
4896 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4897 	    path, ctx);
4898 
4899 	/*
4900 	 * Initialize the extra fields in vnode_attr to pass down our
4901 	 * extra fields.
4902 	 * 1. target cprotect class.
4903 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4904 	 */
4905 	if (flags & O_CREAT) {
4906 		/* lower level kernel code validates that the class is valid before applying it. */
4907 		if (class != PROTECTION_CLASS_DEFAULT) {
4908 			/*
4909 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4910 			 * file behave the same as open (2)
4911 			 */
4912 			VATTR_SET(&va, va_dataprotect_class, class);
4913 		}
4914 	}
4915 
4916 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4917 		if (flags & (O_RDWR | O_WRONLY)) {
4918 			/*
4919 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
4920 			 */
4921 			return EINVAL;
4922 		}
4923 		if (dpflags & O_DP_GETRAWENCRYPTED) {
4924 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4925 		}
4926 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4927 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4928 		}
4929 		if (dpflags & O_DP_AUTHENTICATE) {
4930 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4931 		}
4932 	}
4933 
4934 	error = open1at(vfs_context_current(), &nd, flags, &va,
4935 	    NULL, NULL, retval, fd, authfd);
4936 
4937 	return error;
4938 }
4939 
4940 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)4941 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
4942 {
4943 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
4944 		return EINVAL;
4945 	}
4946 
4947 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
4948 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
4949 }
4950 
4951 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4952 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4953 {
4954 	if (uap->dpflags & O_DP_AUTHENTICATE) {
4955 		return EINVAL;
4956 	}
4957 
4958 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
4959 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
4960 }
4961 
4962 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)4963 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4964     int fd, enum uio_seg segflg, int *retval)
4965 {
4966 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4967 	struct {
4968 		struct vnode_attr va;
4969 		struct nameidata nd;
4970 	} *__open_data;
4971 	struct vnode_attr *vap;
4972 	struct nameidata *ndp;
4973 	int cmode;
4974 	int error;
4975 
4976 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
4977 	vap = &__open_data->va;
4978 	ndp = &__open_data->nd;
4979 
4980 	VATTR_INIT(vap);
4981 	/* Mask off all but regular access permissions */
4982 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4983 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
4984 
4985 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4986 	    segflg, path, ctx);
4987 
4988 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
4989 
4990 	kfree_type(typeof(*__open_data), __open_data);
4991 
4992 	return error;
4993 }
4994 
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a pthread cancellation point; test before doing work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5001 
5002 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5003 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5004     int32_t *retval)
5005 {
5006 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5007 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5008 }
5009 
5010 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5011 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5012     int32_t *retval)
5013 {
5014 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5015 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5016 }
5017 
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a pthread cancellation point; test before doing work. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5024 
5025 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5026 
5027 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5028 vfs_context_can_open_by_id(vfs_context_t ctx)
5029 {
5030 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5031 		return TRUE;
5032 	}
5033 
5034 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5035 	           OPEN_BY_ID_ENTITLEMENT);
5036 }
5037 
5038 /*
5039  * openbyid_np: open a file given a file system id and a file system object id
5040  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5041  *	file systems that don't support object ids it is a node id (uint64_t).
5042  *
5043  * Parameters:	p			Process requesting the open
5044  *		uap			User argument descriptor (see below)
5045  *		retval			Pointer to an area to receive the
 *					return value from the system call
5047  *
5048  * Indirect:	uap->path		Path to open (same as 'open')
5049  *
5050  *		uap->fsid		id of target file system
5051  *		uap->objid		id of target file system object
5052  *		uap->flags		Flags to open (same as 'open')
5053  *
5054  * Returns:	0			Success
5055  *		!0			errno value
5056  *
5057  *
 * XXX:		We should enumerate the possible errno values here, and where
5059  *		in the code they originated.
5060  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Entitlement / platform-binary gate. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/* Retry with a larger buffer each time the path doesn't fit. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate, then open the resolved kernel-space path. */
	buf[pathlen] = 0;

	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5117 
5118 
5119 /*
5120  * Create a special file.
5121  */
5122 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5123     int fd);
5124 
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes is restricted to the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block devices are created here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Caller must be allowed to add entries to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5227 
5228 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5229 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5230 {
5231 	struct vnode_attr va;
5232 
5233 	VATTR_INIT(&va);
5234 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5235 	VATTR_SET(&va, va_rdev, uap->dev);
5236 
5237 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5238 }
5239 
5240 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5241 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5242 {
5243 	struct vnode_attr va;
5244 
5245 	VATTR_INIT(&va);
5246 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5247 	VATTR_SET(&va, va_rdev, uap->dev);
5248 
5249 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5250 }
5251 
5252 /*
5253  * Create a named pipe.
5254  *
5255  * Returns:	0			Success
5256  *		EEXIST
5257  *	namei:???
5258  *	vnode_authorize:???
5259  *	vn_create:???
5260  */
/*
 * Create a FIFO named by 'upath' (resolved relative to 'fd') with the
 * attributes in 'vap'.  Shared by mkfifo(2), mkfifoat(2),
 * mkfifo_extended(2) and mknod(2) with S_IFIFO.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the parent directory (LOCKPARENT keeps an iocount on dvp). */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5303 
5304 
5305 /*
5306  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5307  *
5308  * Parameters:	p			Process requesting the open
5309  *		uap			User argument descriptor (see below)
5310  *		retval			(Ignored)
5311  *
5312  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5313  *		uap->uid		UID to set
5314  *		uap->gid		GID to set
5315  *		uap->mode		File mode to set (same as 'mkfifo')
5316  *		uap->xsecurity		ACL to set, if creating
5317  *
5318  * Returns:	0			Success
5319  *		!0			errno value
5320  *
5321  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5322  *
5323  * XXX:		We should enummerate the possible errno values here, and where
5324  *		in the code they originated.
5325  */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if one was provided. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	/* KAUTH_UID_NONE / KAUTH_GID_NONE mean "leave unset". */
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		/* va_acl points into xsecdst; must stay alive until after mkfifo1(). */
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);

	/* Free our kernel copy of the filesec now that mkfifo1() is done with it. */
	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
5362 
5363 /* ARGSUSED */
5364 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5365 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5366 {
5367 	struct vnode_attr va;
5368 
5369 	VATTR_INIT(&va);
5370 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5371 
5372 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5373 }
5374 
5375 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5376 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5377 {
5378 	struct vnode_attr va;
5379 
5380 	VATTR_INIT(&va);
5381 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5382 
5383 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5384 }
5385 
5386 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5387 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5388 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5389 
/*
 * Build the path for 'dvp' into 'path' (at least '_len' bytes), optionally
 * appending '/leafname'.  If the full path cannot be obtained, fall back to
 * walking up v_parent (or to the mount point) so callers always get some
 * usable prefix; '*truncated_path' is set to 1 whenever the result is not
 * the complete path.  'firmlink' selects whether firmlinks are resolved.
 * Returns the string length including the terminating NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* 'len' includes the NUL; overwrite it with '/' and append the leaf */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* got the directory path, but no room left to append the leaf */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* walk up the hierarchy until some ancestor yields a path */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5457 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-resolving variant of the shared path builder. */
	const int resolve_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5463 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Variant of the shared path builder that does not resolve firmlinks. */
	const int resolve_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5469 
5470 /*
5471  * Make a hard file link.
5472  *
5473  * Returns:	0			Success
5474  *		EPERM
5475  *		EEXIST
5476  *		EXDEV
5477  *	namei:???
5478  *	vnode_authorize:???
5479  *	VNOP_LINK:???
5480  */
5481 /* ARGSUSED */
/*
 * Common implementation for link(2) and linkat(2): create a hard link named
 * by 'link' (resolved relative to fd2) to the existing object named by
 * 'path' (resolved relative to fd1).  AT_SYMLINK_FOLLOW in 'flag' controls
 * whether a trailing symlink in 'path' is followed.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node, reusing 'nd' for a second (CREATE) lookup */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/*
	 * Only materialize path strings if someone (fsevents, kauth fileop
	 * listeners, or audit) will actually consume them.
	 */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5699 
5700 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5701 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5702 {
5703 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5704 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5705 }
5706 
5707 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5708 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5709 {
5710 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5711 		return EINVAL;
5712 	}
5713 
5714 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5715 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5716 }
5717 
5718 /*
5719  * Make a symbolic link.
5720  *
5721  * We could add support for ACLs here too...
5722  */
5723 /* ARGSUSED */
/*
 * Common implementation for symlink(2) and symlinkat(2): create a symbolic
 * link at 'link' (resolved relative to 'fd') whose contents are the string
 * at 'path_data'.  'segflg' indicates whether 'path_data'/'link' are user
 * or kernel addresses.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	/* Copy the link contents into a kernel buffer if they live in userspace. */
	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Symlinks get VLNK type and ACCESSPERMS filtered by the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the kernel copy only if we allocated one (userspace case). */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5887 
5888 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5889 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5890 {
5891 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5892 	           uap->link, UIO_USERSPACE);
5893 }
5894 
5895 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5896 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5897     __unused int32_t *retval)
5898 {
5899 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5900 	           uap->path2, UIO_USERSPACE);
5901 }
5902 
5903 /*
5904  * Delete a whiteout from the filesystem.
5905  * No longer supported.
5906  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout support was removed; this syscall always fails. */
	return ENOTSUP;
}
5912 
5913 /*
5914  * Delete a name from the filesystem.
5915  */
5916 /* ARGSUSED */
/*
 * Common implementation for unlink(2)/unlinkat(2)/delete(2) and the
 * in-kernel unlink1(): remove the file named by 'path_arg', resolved
 * relative to 'start_dvp' when non-NULL, otherwise relative to 'fd'.
 * 'unlink_flags' carries VNODE_REMOVE_* modifiers.  Supports filesystems
 * with compound (lookup+remove) VNOPs, including the EKEEPLOOKING
 * continuation protocol and ENOENT-race retries.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep these large structures off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset per-attempt state; we may loop back here after an ENOENT race. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* ENOENT here can mean we lost a race; retry a bounded number of times. */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp: the filesystem will do lookup+remove in one compound VNOP. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Materialize path strings only when someone will consume them. */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6198 
/*
 * In-kernel unlink entry point: remove 'path_arg', resolved relative to
 * 'start_dvp' when non-NULL, otherwise relative to the current working
 * directory.
 */
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	           unlink_flags);
}
6206 
6207 /*
6208  * Delete a name from the filesystem using Carbon semantics.
6209  */
6210 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6211 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6212 {
6213 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6214 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6215 }
6216 
6217 /*
6218  * Delete a name from the filesystem using POSIX semantics.
6219  */
6220 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6221 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6222 {
6223 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6224 	           uap->path, UIO_USERSPACE, 0);
6225 }
6226 
6227 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6228 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6229 {
6230 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6231 		return EINVAL;
6232 	}
6233 
6234 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6235 		int unlink_flags = 0;
6236 
6237 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6238 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6239 		}
6240 		return rmdirat_internal(vfs_context_current(), uap->fd,
6241 		           uap->path, UIO_USERSPACE, unlink_flags);
6242 	} else {
6243 		return unlinkat_internal(vfs_context_current(), uap->fd,
6244 		           NULLVP, uap->path, UIO_USERSPACE, 0);
6245 	}
6246 }
6247 
6248 /*
6249  * Reposition read/write file offset.
6250  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* ENOTSUP means the fd is not backed by a vnode; report ESPIPE */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		/* Seeking on a FIFO is meaningless */
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) is a pure "get current offset" query; it is
	 * checked under a different MAC policy hook than seeks that may
	 * actually change the offset.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:
		/* SEEK_CUR: relative to the current file offset */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* SEEK_END: relative to the current end of file */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		/* SEEK_SET: absolute offset, used as-is */
		break;
	case SEEK_HOLE:
		/* Ask the filesystem for the next hole at/after offset */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		/* Ask the filesystem for the next data region at/after offset */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		/* A positive displacement that wrapped negative overflowed off_t */
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6342 
6343 
6344 /*
6345  * Check access permissions.
6346  *
6347  * Returns:	0			Success
6348  *		vnode_authorize:???
6349  */
6350 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6351 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6352 {
6353 	kauth_action_t action;
6354 	int error;
6355 
6356 	/*
6357 	 * If just the regular access bits, convert them to something
6358 	 * that vnode_authorize will understand.
6359 	 */
6360 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6361 		action = 0;
6362 		if (uflags & R_OK) {
6363 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6364 		}
6365 		if (uflags & W_OK) {
6366 			if (vnode_isdir(vp)) {
6367 				action |= KAUTH_VNODE_ADD_FILE |
6368 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6369 				/* might want delete rights here too */
6370 			} else {
6371 				action |= KAUTH_VNODE_WRITE_DATA;
6372 			}
6373 		}
6374 		if (uflags & X_OK) {
6375 			if (vnode_isdir(vp)) {
6376 				action |= KAUTH_VNODE_SEARCH;
6377 			} else {
6378 				action |= KAUTH_VNODE_EXECUTE;
6379 			}
6380 		}
6381 	} else {
6382 		/* take advantage of definition of uflags */
6383 		action = uflags >> 8;
6384 	}
6385 
6386 #if CONFIG_MACF
6387 	error = mac_vnode_check_access(ctx, vp, uflags);
6388 	if (error) {
6389 		return error;
6390 	}
6391 #endif /* MAC */
6392 
6393 	/* action == 0 means only check for existence */
6394 	if (action != 0) {
6395 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6396 	} else {
6397 		error = 0;
6398 	}
6399 
6400 	return error;
6401 }
6402 
6403 
6404 
6405 /*
6406  * access_extended: Check access permissions in bulk.
6407  *
6408  * Description:	uap->entries		Pointer to an array of accessx
6409  *                                      descriptor structs, plus one or
6410  *                                      more NULL terminated strings (see
6411  *                                      "Notes" section below).
6412  *		uap->size		Size of the area pointed to by
6413  *					uap->entries.
6414  *		uap->results		Pointer to the results array.
6415  *
6416  * Returns:	0			Success
6417  *		ENOMEM			Insufficient memory
6418  *		EINVAL			Invalid arguments
6419  *		namei:EFAULT		Bad address
6420  *		namei:ENAMETOOLONG	Filename too long
6421  *		namei:ENOENT		No such file or directory
6422  *		namei:ELOOP		Too many levels of symbolic links
6423  *		namei:EBADF		Bad file descriptor
6424  *		namei:ENOTDIR		Not a directory
6425  *		namei:???
6426  *		access1:
6427  *
6428  * Implicit returns:
6429  *		uap->results		Array contents modified
6430  *
6431  * Notes:	The uap->entries are structured as an arbitrary length array
6432  *		of accessx descriptors, followed by one or more NULL terminated
6433  *		strings
6434  *
6435  *			struct accessx_descriptor[0]
6436  *			...
6437  *			struct accessx_descriptor[n]
6438  *			char name_data[0];
6439  *
6440  *		We determine the entry count by walking the buffer containing
6441  *		the uap->entries argument descriptor.  For each descriptor we
6442  *		see, the valid values for the offset ad_name_offset will be
6443  *		in the byte range:
6444  *
6445  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6446  *						to
6447  *				[ uap->entries + uap->size - 2 ]
6448  *
6449  *		since we must have at least one string, and the string must
6450  *		be at least one character plus the NULL terminator in length.
6451  *
6452  * XXX:		Need to support the check-as uid argument
6453  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL so the cleanup path can tell whether a credential was taken */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests are served from the stack to avoid an allocation */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	/* one errno_t result slot per descriptor we will process */
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  "Soft" failures are recorded in the
		 * per-descriptor result slot; anything else aborts the call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6695 
6696 
6697 /*
6698  * Returns:	0			Success
6699  *		namei:EFAULT		Bad address
6700  *		namei:ENAMETOOLONG	Filename too long
6701  *		namei:ENOENT		No such file or directory
6702  *		namei:ELOOP		Too many levels of symbolic links
6703  *		namei:EBADF		Bad file descriptor
6704  *		namei:ENOTDIR		Not a directory
6705  *		namei:???
6706  *		access1:
6707  */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* AT_SYMLINK_NOFOLLOW*: check the link itself, not its target */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks in any path component */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* perform the actual permission check against the chosen identity */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* WANTPARENT gave us an iocount on the parent as well */
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* release the real-identity credential taken above */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6789 
6790 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6791 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6792 {
6793 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6794 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6795 }
6796 
6797 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6798 faccessat(__unused proc_t p, struct faccessat_args *uap,
6799     __unused int32_t *retval)
6800 {
6801 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6802 		return EINVAL;
6803 	}
6804 
6805 	return faccessat_internal(vfs_context_current(), uap->fd,
6806 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6807 }
6808 
6809 /*
6810  * Returns:	0			Success
6811  *		EFAULT
6812  *	copyout:EFAULT
6813  *	namei:???
6814  *	vn_stat:???
6815  */
6816 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6817 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6818     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6819     enum uio_seg segflg, int fd, int flag)
6820 {
6821 	struct nameidata nd;
6822 	int follow;
6823 	union {
6824 		struct stat sb;
6825 		struct stat64 sb64;
6826 	} source = {};
6827 	union {
6828 		struct user64_stat user64_sb;
6829 		struct user32_stat user32_sb;
6830 		struct user64_stat64 user64_sb64;
6831 		struct user32_stat64 user32_sb64;
6832 	} dest = {};
6833 	caddr_t sbp;
6834 	int error, my_size;
6835 	kauth_filesec_t fsec;
6836 	size_t xsecurity_bufsize;
6837 	void * statptr;
6838 	struct fileproc *fp = NULL;
6839 	int needsrealdev = 0;
6840 
6841 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6842 	NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6843 	    segflg, path, ctx);
6844 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6845 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6846 	}
6847 
6848 #if NAMEDRSRCFORK
6849 	int is_namedstream = 0;
6850 	/* stat calls are allowed for resource forks. */
6851 	nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6852 #endif
6853 
6854 	if (flag & AT_FDONLY) {
6855 		vnode_t fvp;
6856 
6857 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6858 		if (error) {
6859 			return error;
6860 		}
6861 		if ((error = vnode_getwithref(fvp))) {
6862 			file_drop(fd);
6863 			return error;
6864 		}
6865 		nd.ni_vp = fvp;
6866 	} else {
6867 		error = nameiat(&nd, fd);
6868 		if (error) {
6869 			return error;
6870 		}
6871 	}
6872 	fsec = KAUTH_FILESEC_NONE;
6873 
6874 	statptr = (void *)&source;
6875 
6876 #if NAMEDRSRCFORK
6877 	/* Grab reference on the shadow stream file vnode to
6878 	 * force an inactive on release which will mark it
6879 	 * for recycle.
6880 	 */
6881 	if (vnode_isnamedstream(nd.ni_vp) &&
6882 	    (nd.ni_vp->v_parent != NULLVP) &&
6883 	    vnode_isshadow(nd.ni_vp)) {
6884 		is_namedstream = 1;
6885 		vnode_ref(nd.ni_vp);
6886 	}
6887 #endif
6888 
6889 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6890 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6891 		/*
6892 		 * If the caller has the file open, and is not
6893 		 * requesting extended security information, we are
6894 		 * going to let them get the basic stat information.
6895 		 */
6896 		error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6897 		    fp->fp_glob->fg_cred);
6898 	} else {
6899 		error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6900 		    isstat64, needsrealdev, ctx);
6901 	}
6902 
6903 #if NAMEDRSRCFORK
6904 	if (is_namedstream) {
6905 		vnode_rele(nd.ni_vp);
6906 	}
6907 #endif
6908 	vnode_put(nd.ni_vp);
6909 	nameidone(&nd);
6910 	if (fp) {
6911 		file_drop(fd);
6912 		fp = NULL;
6913 	}
6914 
6915 	if (error) {
6916 		return error;
6917 	}
6918 	/* Zap spare fields */
6919 	if (isstat64 != 0) {
6920 		source.sb64.st_lspare = 0;
6921 		source.sb64.st_qspare[0] = 0LL;
6922 		source.sb64.st_qspare[1] = 0LL;
6923 		if (vfs_context_is64bit(ctx)) {
6924 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6925 			my_size = sizeof(dest.user64_sb64);
6926 			sbp = (caddr_t)&dest.user64_sb64;
6927 		} else {
6928 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6929 			my_size = sizeof(dest.user32_sb64);
6930 			sbp = (caddr_t)&dest.user32_sb64;
6931 		}
6932 		/*
6933 		 * Check if we raced (post lookup) against the last unlink of a file.
6934 		 */
6935 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6936 			source.sb64.st_nlink = 1;
6937 		}
6938 	} else {
6939 		source.sb.st_lspare = 0;
6940 		source.sb.st_qspare[0] = 0LL;
6941 		source.sb.st_qspare[1] = 0LL;
6942 		if (vfs_context_is64bit(ctx)) {
6943 			munge_user64_stat(&source.sb, &dest.user64_sb);
6944 			my_size = sizeof(dest.user64_sb);
6945 			sbp = (caddr_t)&dest.user64_sb;
6946 		} else {
6947 			munge_user32_stat(&source.sb, &dest.user32_sb);
6948 			my_size = sizeof(dest.user32_sb);
6949 			sbp = (caddr_t)&dest.user32_sb;
6950 		}
6951 
6952 		/*
6953 		 * Check if we raced (post lookup) against the last unlink of a file.
6954 		 */
6955 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6956 			source.sb.st_nlink = 1;
6957 		}
6958 	}
6959 	if ((error = copyout(sbp, ub, my_size)) != 0) {
6960 		goto out;
6961 	}
6962 
6963 	/* caller wants extended security information? */
6964 	if (xsecurity != USER_ADDR_NULL) {
6965 		/* did we get any? */
6966 		if (fsec == KAUTH_FILESEC_NONE) {
6967 			if (susize(xsecurity_size, 0) != 0) {
6968 				error = EFAULT;
6969 				goto out;
6970 			}
6971 		} else {
6972 			/* find the user buffer size */
6973 			xsecurity_bufsize = fusize(xsecurity_size);
6974 
6975 			/* copy out the actual data size */
6976 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6977 				error = EFAULT;
6978 				goto out;
6979 			}
6980 
6981 			/* if the caller supplied enough room, copy out to it */
6982 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6983 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6984 			}
6985 		}
6986 	}
6987 out:
6988 	if (fsec != KAUTH_FILESEC_NONE) {
6989 		kauth_filesec_free(fsec);
6990 	}
6991 	return error;
6992 }
6993 
6994 /*
6995  * stat_extended: Get file status; with extended security (ACL).
6996  *
6997  * Parameters:    p                       (ignored)
6998  *                uap                     User argument descriptor (see below)
6999  *                retval                  (ignored)
7000  *
7001  * Indirect:      uap->path               Path of file to get status from
7002  *                uap->ub                 User buffer (holds file status info)
7003  *                uap->xsecurity          ACL to get (extended security)
7004  *                uap->xsecurity_size     Size of ACL
7005  *
7006  * Returns:        0                      Success
7007  *                !0                      errno value
7008  *
7009  */
7010 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7011 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7012     __unused int32_t *retval)
7013 {
7014 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7015 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7016 	           0);
7017 }
7018 
7019 /*
7020  * Returns:	0			Success
7021  *	fstatat_internal:???		[see fstatat_internal() in this file]
7022  */
7023 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7024 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7025 {
7026 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7027 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7028 }
7029 
7030 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7031 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7032 {
7033 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7034 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7035 }
7036 
7037 /*
7038  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7039  *
7040  * Parameters:    p                       (ignored)
7041  *                uap                     User argument descriptor (see below)
7042  *                retval                  (ignored)
7043  *
7044  * Indirect:      uap->path               Path of file to get status from
7045  *                uap->ub                 User buffer (holds file status info)
7046  *                uap->xsecurity          ACL to get (extended security)
7047  *                uap->xsecurity_size     Size of ACL
7048  *
7049  * Returns:        0                      Success
7050  *                !0                      errno value
7051  *
7052  */
7053 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7054 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7055 {
7056 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7057 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7058 	           0);
7059 }
7060 
7061 /*
7062  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7063  *
7064  * Parameters:    p                       (ignored)
7065  *                uap                     User argument descriptor (see below)
7066  *                retval                  (ignored)
7067  *
7068  * Indirect:      uap->path               Path of file to get status from
7069  *                uap->ub                 User buffer (holds file status info)
7070  *                uap->xsecurity          ACL to get (extended security)
7071  *                uap->xsecurity_size     Size of ACL
7072  *
7073  * Returns:        0                      Success
7074  *                !0                      errno value
7075  *
7076  */
7077 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7078 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7079 {
7080 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7081 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7082 	           AT_SYMLINK_NOFOLLOW);
7083 }
7084 
7085 /*
7086  * Get file status; this version does not follow links.
7087  */
7088 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7089 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7090 {
7091 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7092 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7093 }
7094 
7095 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7096 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7097 {
7098 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7099 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7100 }
7101 
7102 /*
7103  * lstat64_extended: Get file status; can handle large inode numbers; does not
7104  * follow links; with extended security (ACL).
7105  *
7106  * Parameters:    p                       (ignored)
7107  *                uap                     User argument descriptor (see below)
7108  *                retval                  (ignored)
7109  *
7110  * Indirect:      uap->path               Path of file to get status from
7111  *                uap->ub                 User buffer (holds file status info)
7112  *                uap->xsecurity          ACL to get (extended security)
7113  *                uap->xsecurity_size     Size of ACL
7114  *
7115  * Returns:        0                      Success
7116  *                !0                      errno value
7117  *
7118  */
7119 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7120 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7121 {
7122 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7123 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7124 	           AT_SYMLINK_NOFOLLOW);
7125 }
7126 
7127 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7128 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7129 {
7130 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7131 		return EINVAL;
7132 	}
7133 
7134 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7135 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7136 }
7137 
7138 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7139 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7140     __unused int32_t *retval)
7141 {
7142 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7143 		return EINVAL;
7144 	}
7145 
7146 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7147 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7148 }
7149 
7150 /*
7151  * Get configurable pathname variables.
7152  *
7153  * Returns:	0			Success
7154  *	namei:???
7155  *	vn_pathconf:???
7156  *
7157  * Notes:	Global implementation  constants are intended to be
7158  *		implemented in this function directly; all other constants
7159  *		are per-FS implementation, and therefore must be handled in
7160  *		each respective FS, instead.
7161  *
7162  * XXX We implement some things globally right now that should actually be
7163  * XXX per-FS; we will need to deal with this at some point.
7164  */
7165 /* ARGSUSED */
7166 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7167 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7168 {
7169 	int error;
7170 	struct nameidata nd;
7171 	vfs_context_t ctx = vfs_context_current();
7172 
7173 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7174 	    UIO_USERSPACE, uap->path, ctx);
7175 	error = namei(&nd);
7176 	if (error) {
7177 		return error;
7178 	}
7179 
7180 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7181 
7182 	vnode_put(nd.ni_vp);
7183 	nameidone(&nd);
7184 	return error;
7185 }
7186 
7187 /*
7188  * Return target name of a symbolic link.
7189  */
7190 /* ARGSUSED */
7191 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7192 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7193     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7194     int *retval)
7195 {
7196 	vnode_t vp;
7197 	uio_t auio;
7198 	int error;
7199 	struct nameidata nd;
7200 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
7201 	bool put_vnode;
7202 
7203 	if (bufsize > INT32_MAX) {
7204 		return EINVAL;
7205 	}
7206 
7207 	if (lnk_vp) {
7208 		vp = lnk_vp;
7209 		put_vnode = false;
7210 	} else {
7211 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7212 		    seg, path, ctx);
7213 
7214 		error = nameiat(&nd, fd);
7215 		if (error) {
7216 			return error;
7217 		}
7218 		vp = nd.ni_vp;
7219 		put_vnode = true;
7220 		nameidone(&nd);
7221 	}
7222 
7223 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7224 	    &uio_buf[0], sizeof(uio_buf));
7225 	uio_addiov(auio, buf, bufsize);
7226 	if (vp->v_type != VLNK) {
7227 		error = EINVAL;
7228 	} else {
7229 #if CONFIG_MACF
7230 		error = mac_vnode_check_readlink(ctx, vp);
7231 #endif
7232 		if (error == 0) {
7233 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7234 			    ctx);
7235 		}
7236 		if (error == 0) {
7237 			error = VNOP_READLINK(vp, auio, ctx);
7238 		}
7239 	}
7240 
7241 	if (put_vnode) {
7242 		vnode_put(vp);
7243 	}
7244 
7245 	*retval = (int)(bufsize - uio_resid(auio));
7246 	return error;
7247 }
7248 
7249 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7250 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7251 {
7252 	enum uio_seg procseg;
7253 	vnode_t vp;
7254 	int error;
7255 
7256 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7257 
7258 	AUDIT_ARG(fd, uap->fd);
7259 
7260 	if ((error = file_vnode(uap->fd, &vp))) {
7261 		return error;
7262 	}
7263 	if ((error = vnode_getwithref(vp))) {
7264 		file_drop(uap->fd);
7265 		return error;
7266 	}
7267 
7268 	error = readlinkat_internal(vfs_context_current(), -1,
7269 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7270 	    uap->bufsize, procseg, retval);
7271 
7272 	vnode_put(vp);
7273 	file_drop(uap->fd);
7274 	return error;
7275 }
7276 
7277 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7278 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7279 {
7280 	enum uio_seg procseg;
7281 
7282 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7283 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7284 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7285 	           uap->count, procseg, retval);
7286 }
7287 
7288 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7289 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7290 {
7291 	enum uio_seg procseg;
7292 
7293 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7294 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7295 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7296 	           retval);
7297 }
7298 
7299 /*
7300  * Change file flags, the deep inner layer.
7301  */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC hook may veto the flags change before any authorization work. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* Apply the change through the caller-supplied setter. */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	if (error == 0) {
		/* Notify MAC modules only after the flags were actually set. */
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7340 
7341 /*
7342  * Change file flags.
7343  *
7344  * NOTE: this will vnode_put() `vp'
7345  */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/*
	 * vnode_setattr() is passed as the generic setter expected by
	 * chflags0(); `&va' serves both as the attribute list and as the
	 * setter's opaque argument.
	 */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	/* Consumes the caller's iocount -- see the NOTE in the header above. */
	vnode_put(vp);

	/* A filesystem that silently ignored va_flags is reported as ENOTSUP. */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7364 
7365 /*
7366  * Change flags of a file given a path name.
7367  */
7368 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also return the parent so any directory lease can be broken first. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break leases on the parent directory (write intent), then drop its iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7403 
7404 /*
7405  * Change flags of a file given a file descriptor.
7406  */
7407 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/*
	 * Only the file vnode is at hand here; the `true' second argument
	 * apparently asks vnode_breakdirlease() to locate the parent itself
	 * (contrast chflags(), which passes nd.ni_dvp with `false').
	 */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7437 
7438 /*
7439  * Change security information on a filesystem object.
7440  *
7441  * Returns:	0			Success
7442  *		EPERM			Operation not permitted
7443  *		vnode_authattr:???	[anything vnode_authattr can return]
7444  *		vnode_authorize:???	[anything vnode_authorize can return]
7445  *		vnode_setattr:???	[anything vnode_setattr can return]
7446  *
7447  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7448  *		translated to EPERM before being returned.
7449  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC preflight for each class of attribute actually being set. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* per the function header: auth failures surface as EPERM */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Change succeeded: notify MAC modules of everything that was set. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7517 
7518 
7519 /*
7520  * Change mode of a file given a path name.
7521  *
7522  * Returns:	0			Success
7523  *		namei:???		[anything namei can return]
7524  *		chmod_vnode:???		[anything chmod_vnode can return]
7525  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also return the parent so any directory lease can be broken first. */
	wantparent = WANTPARENT;
#endif

	/* Either NOFOLLOW flag keeps the lookup from following a final symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Additionally refuse symlinks anywhere along the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break parent-directory leases (write intent) and drop its iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7558 
/*
 * Shared setup for chmod_extended()/fchmod_extended(): translate the
 * mode/uid/gid/xsecurity syscall arguments into a vnode_attr.  A `mode'
 * of -1 and KAUTH_UID_NONE/KAUTH_GID_NONE mean "leave unchanged".  On
 * success the caller owns *pxsecdst (if non-NULL) and must
 * kauth_filesec_free() it once the attributes have been applied, since
 * pva->va_acl may point into it.
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* No ACL change requested. */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		/* Sentinel address 1 means "delete the existing ACL". */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* Copy the caller-supplied filesec in; pva->va_acl aliases it. */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7603 
7604 /*
7605  * chmod_extended: Change the mode of a file given a path name; with extended
7606  * argument list (including extended security (ACL)).
7607  *
7608  * Parameters:	p			Process requesting the open
7609  *		uap			User argument descriptor (see below)
7610  *		retval			(ignored)
7611  *
7612  * Indirect:	uap->path		Path to object (same as 'chmod')
7613  *		uap->uid		UID to set
7614  *		uap->gid		GID to set
7615  *		uap->mode		File mode to set (same as 'chmod')
7616  *		uap->xsecurity		ACL to set (or delete)
7617  *
7618  * Returns:	0			Success
7619  *		!0			errno value
7620  *
7621  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7622  *
7623  * XXX:		We should enummerate the possible errno values here, and where
7624  *		in the code they originated.
7625  */
7626 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7627 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7628 {
7629 	int error;
7630 	struct vnode_attr va;
7631 	kauth_filesec_t xsecdst = NULL;
7632 
7633 	AUDIT_ARG(owner, uap->uid, uap->gid);
7634 
7635 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7636 	    uap->gid, uap->xsecurity);
7637 
7638 	if (error) {
7639 		return error;
7640 	}
7641 
7642 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7643 	    UIO_USERSPACE);
7644 
7645 	if (xsecdst != NULL) {
7646 		kauth_filesec_free(xsecdst);
7647 	}
7648 	return error;
7649 }
7650 
7651 /*
7652  * Returns:	0			Success
7653  *		chmodat:???		[anything chmodat can return]
7654  */
7655 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7656 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7657     int flag, enum uio_seg segflg)
7658 {
7659 	struct vnode_attr va;
7660 
7661 	VATTR_INIT(&va);
7662 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7663 
7664 	return chmodat(ctx, path, &va, fd, flag, segflg);
7665 }
7666 
7667 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7668 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7669 {
7670 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7671 	           AT_FDCWD, 0, UIO_USERSPACE);
7672 }
7673 
7674 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7675 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7676 {
7677 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7678 		return EINVAL;
7679 	}
7680 
7681 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7682 	           uap->fd, uap->flag, UIO_USERSPACE);
7683 }
7684 
7685 /*
7686  * Change mode of a file given a file descriptor.
7687  */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* `true': only the file vnode is known; its parent is located internally. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* Delegate checks, setattr and MAC notifications to chmod_vnode(). */
	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7715 
7716 /*
7717  * fchmod_extended: Change mode of a file given a file descriptor; with
7718  * extended argument list (including extended security (ACL)).
7719  *
7720  * Parameters:    p                       Process requesting to change file mode
7721  *                uap                     User argument descriptor (see below)
7722  *                retval                  (ignored)
7723  *
7724  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7725  *                uap->uid                UID to set
7726  *                uap->gid                GID to set
7727  *                uap->xsecurity          ACL to set (or delete)
7728  *                uap->fd                 File descriptor of file to change mode
7729  *
7730  * Returns:        0                      Success
7731  *                !0                      errno value
7732  *
7733  */
7734 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7735 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7736 {
7737 	int error;
7738 	struct vnode_attr va;
7739 	kauth_filesec_t xsecdst = NULL;
7740 
7741 	AUDIT_ARG(owner, uap->uid, uap->gid);
7742 
7743 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7744 	    uap->gid, uap->xsecurity);
7745 
7746 	if (error) {
7747 		return error;
7748 	}
7749 
7750 	error = fchmod1(p, uap->fd, &va);
7751 
7752 	if (xsecdst != NULL) {
7753 		kauth_filesec_free(xsecdst);
7754 	}
7755 	return error;
7756 }
7757 
7758 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7759 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7760 {
7761 	struct vnode_attr va;
7762 
7763 	VATTR_INIT(&va);
7764 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7765 
7766 	return fchmod1(p, uap->fd, &va);
7767 }
7768 
7769 
7770 /*
7771  * Set ownership given a path name.
7772  */
7773 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also return the parent so any directory lease can be broken first. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(owner, uid, gid);

	/* Either NOFOLLOW flag keeps the lookup from following a final symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Additionally refuse symlinks anywhere along the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	/* VNOVAL for uid/gid means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Authorized: break parent-directory leases before changing ownership. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

#if CONFIG_FILE_LEASES
	/* Parent iocount is held until here because of WANTPARENT above. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7855 
7856 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7857 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7858 {
7859 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7860 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7861 }
7862 
7863 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7864 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7865 {
7866 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7867 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7868 }
7869 
7870 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7871 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7872 {
7873 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7874 		return EINVAL;
7875 	}
7876 
7877 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7878 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7879 }
7880 
7881 /*
7882  * Set ownership given a file descriptor.
7883  */
7884 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL for uid/gid means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures surface as EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* `true': only the file vnode is known; its parent is located internally. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7958 
/*
 * Copy in a utimes()-style timeval pair from user space and convert it to
 * timespecs: tsp[0] = access time, tsp[1] = modification time (as consumed
 * by setutimes()).  A USER_ADDR_NULL pointer means "use the current time"
 * for both entries.
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/* The user-space struct layout differs for 32/64-bit processes. */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
7991 
/*
 * Apply access (ts[0]) and modification (ts[1]) times to `vp'.
 * `nullflag' is set when the caller passed a NULL times pointer
 * ("set to now"): it marks the request VA_UTIMES_NULL and suppresses
 * the EACCES -> EPERM translation on authorization failure.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit times require ownership/write perms: report EPERM */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8048 
8049 /*
8050  * Set the access and modification times of a file.
8051  */
8052 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also return the parent so any directory lease can be broken first. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	/* nullflag tells setutimes() this is a "set to now" request. */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* Parent iocount from WANTPARENT is released on every path. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8101 
8102 /*
8103  * Set the access and modification times of a file.
8104  */
8105 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	usrtvp = uap->tptr;
	/* Copy in (or synthesize) the times before taking any vnode refs. */
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* `true': only the file vnode is known; its parent is located internally. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* usrtvp == 0 means "set to current time" (VA_UTIMES_NULL). */
	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8137 
8138 static int
truncate_validate_common(proc_t p,off_t length)8139 truncate_validate_common(proc_t p, off_t length)
8140 {
8141 	rlim_t fsize_limit;
8142 
8143 	if (length < 0) {
8144 		return EINVAL;
8145 	}
8146 
8147 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8148 	if ((rlim_t)length > fsize_limit) {
8149 		psignal(p, SIGXFSZ);
8150 		return EFBIG;
8151 	}
8152 
8153 	return 0;
8154 }
8155 
/*
 * Common truncate implementation: set va_data_size on `vp'.  `need_auth'
 * is false when coming from ftruncate(), whose authorization effectively
 * happened at open time (see comment below); `cred' is the credential
 * handed to the MAC truncate hooks.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8206 
8207 /*
8208  * Truncate a file given its path name.
8209  */
8210 /* ARGSUSED */
8211 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8212 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8213 {
8214 	vfs_context_t ctx = vfs_context_current();
8215 	vnode_t vp;
8216 	int error;
8217 	struct nameidata nd;
8218 
8219 	if ((error = truncate_validate_common(p, uap->length))) {
8220 		return error;
8221 	}
8222 
8223 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8224 	    UIO_USERSPACE, uap->path, ctx);
8225 
8226 	if ((error = namei(&nd))) {
8227 		return error;
8228 	}
8229 
8230 	vp = nd.ni_vp;
8231 	nameidone(&nd);
8232 
8233 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8234 	vnode_put(vp);
8235 
8236 	return error;
8237 }
8238 
8239 /*
8240  * Truncate a file given a file descriptor.
8241  */
8242 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared-memory objects are truncated by pshm; only vnodes proceed. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8293 
8294 
8295 /*
8296  * Sync an open file with synchronized I/O _file_ integrity completion
8297  */
8298 /* ARGSUSED */
8299 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8300 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8301 {
8302 	__pthread_testcancel(1);
8303 	return fsync_common(p, uap, MNT_WAIT);
8304 }
8305 
8306 
8307 /*
8308  * Sync an open file with synchronized I/O _file_ integrity completion
8309  *
8310  * Notes:	This is a legacy support function that does not test for
8311  *		thread cancellation points.
8312  */
8313 /* ARGSUSED */
8314 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8315 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8316 {
8317 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8318 }
8319 
8320 
8321 /*
8322  * Sync an open file with synchronized I/O _data_ integrity completion
8323  */
8324 /* ARGSUSED */
8325 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8326 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8327 {
8328 	__pthread_testcancel(1);
8329 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8330 }
8331 
8332 
8333 /*
8334  * fsync_common
8335  *
8336  * Common fsync code to support both synchronized I/O file integrity completion
8337  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8338  *
8339  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8340  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8342  * includes additional metadata unnecessary for retrieving the file data
8343  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8344  * storage.
8345  *
8346  * Parameters:	p				The process
8347  *		uap->fd				The descriptor to synchronize
8348  *		flags				The data integrity flags
8349  *
8350  * Returns:	int				Success
8351  *	fp_getfvp:EBADF				Bad file descriptor
8352  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8353  *	VNOP_FSYNC:???				unspecified
8354  *
8355  * Notes:	We use struct fsync_args because it is a short name, and all
8356  *		caller argument structures are otherwise identical.
8357  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to its fileproc and backing vnode. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; on failure drop the fd reference
	 * that fp_getfvp acquired before returning. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* 'flags' is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		/* Best effort: result of the shadow-file flush is ignored. */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	/* Release the iocount and the fd reference taken above. */
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8395 
8396 /*
8397  * Duplicate files.  Source must be a file, target must be a file or
8398  * must not exist.
8399  *
8400  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8401  *     perform inheritance correctly.
8402  */
8403 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; an iocount is held on fromnd.ni_vp on success. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* Look up the target for creation; SAVESTART keeps ni_startdir
	 * referenced so it can be released in the 'out' path below. */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories may be neither source nor target. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets cannot be copied (except fdesc-backed vnodes). */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Must be able to read the source... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ...delete an existing target... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ...and add an entry in the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file onto its own parent directory is nonsensical. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 *
	 * error == -1 is an internal sentinel meaning "same file;
	 * succeed without copying" — translated to 0 at the bottom.
	 */
	if (fvp == tvp) {
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Map the "same file" sentinel to success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8510 
8511 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8512 
8513 /*
8514  * Helper function for doing clones. The caller is expected to provide an
8515  * iocounted source vnode and release it.
8516  */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and plain directories may be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Refuse volume roots, mount points, and covered directories. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination for creation relative to dst_dirfd. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross mount points. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* May we add a file/subdirectory to the destination directory? */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* Require generic read rights on the source; skip the data-read
	 * portion when the caller (fclonefileat) already holds an fd
	 * opened for reading. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		/* va.va_acl was allocated by getattr; remember to free it. */
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		/*
		 * NOTE(review): the error from vnode_authattr_new() is not
		 * examined here and is overwritten by VNOP_CLONEFILE below —
		 * verify that proceeding on failure is intentional.
		 */
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		/* Keep the destination's DATAVAULT/RESTRICTED bits, not the source's. */
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	/* tvp is non-NULL here only if VNOP_CLONEFILE created it. */
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8746 
8747 /*
8748  * clone files or directories, target must not exist.
8749  */
8750 /* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata fromnd;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	/* CLONE_NOFOLLOW: don't follow a terminal symlink on the source. */
	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
		return error;
	}

	/* Keep the iocount on fvp past nameidone; released below. */
	fvp = fromnd.ni_vp;
	nameidone(&fromnd);

	/* FALSE: read-data access to the source has not been authorized yet. */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
8784 
8785 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8786 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8787     __unused int32_t *retval)
8788 {
8789 	vnode_t fvp;
8790 	struct fileproc *fp;
8791 	int error;
8792 	vfs_context_t ctx = vfs_context_current();
8793 
8794 	/* Check that the flags are valid. */
8795 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8796 		return EINVAL;
8797 	}
8798 
8799 	AUDIT_ARG(fd, uap->src_fd);
8800 	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8801 	if (error) {
8802 		return error;
8803 	}
8804 
8805 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8806 		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8807 		error = EBADF;
8808 		goto out;
8809 	}
8810 
8811 	if ((error = vnode_getwithref(fvp))) {
8812 		goto out;
8813 	}
8814 
8815 	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8816 
8817 	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8818 	    uap->flags, ctx);
8819 
8820 	vnode_put(fvp);
8821 out:
8822 	file_drop(uap->src_fd);
8823 	return error;
8824 }
8825 
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);

	/* Only consider mounts whose mount-on path begins with the
	 * renamed parent mount's path. */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* Require a '/' right after the prefix, so "/foo2" is not treated
	 * as a submount of "/foo" (this also skips pmp itself, whose name
	 * ends exactly at prefix_len). */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	/* Busy the submount so it can't unmount while we rewrite its path. */
	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* Recompute the mount-on path in place from the covered vnode,
	 * picking up the renamed component. */
	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8855 
8856 /*
8857  * Rename files.  Source and destination must either both be directories,
8858  * or both not be directories.  If target is a directory, it must be empty.
8859  */
8860 /* ARGSUSED */
8861 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8862 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8863     int tofd, user_addr_t to, int segflg, u_int uflags)
8864 {
8865 	vnode_t tvp, tdvp;
8866 	vnode_t fvp, fdvp;
8867 	vnode_t mnt_fvp;
8868 	struct nameidata *fromnd, *tond;
8869 	int error = 0;
8870 	int do_retry;
8871 	int retry_count;
8872 	int mntrename;
8873 	int need_event;
8874 	int need_kpath2;
8875 	int has_listeners;
8876 	const char *oname = NULL;
8877 	char *from_name = NULL, *to_name = NULL;
8878 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8879 	int from_len = 0, to_len = 0;
8880 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8881 	int holding_mntlock;
8882 	int vn_authorize_skipped;
8883 	mount_t locked_mp = NULL;
8884 	vnode_t oparent = NULLVP;
8885 #if CONFIG_FSE
8886 	fse_info from_finfo = {}, to_finfo;
8887 #endif
8888 	int from_truncated = 0, to_truncated = 0;
8889 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8890 	int batched = 0;
8891 	struct vnode_attr *fvap, *tvap;
8892 	int continuing = 0;
8893 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8894 	int32_t nofollow_any = 0;
8895 	/* carving out a chunk for structs that are too big to be on stack. */
8896 	struct {
8897 		struct nameidata from_node, to_node;
8898 		struct vnode_attr fv_attr, tv_attr;
8899 	} * __rename_data;
8900 
8901 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8902 	fromnd = &__rename_data->from_node;
8903 	tond = &__rename_data->to_node;
8904 
8905 	holding_mntlock = 0;
8906 	do_retry = 0;
8907 	retry_count = 0;
8908 retry:
8909 	fvp = tvp = NULL;
8910 	fdvp = tdvp = NULL;
8911 	fvap = tvap = NULL;
8912 	mnt_fvp = NULLVP;
8913 	mntrename = FALSE;
8914 	vn_authorize_skipped = FALSE;
8915 
8916 	if (uflags & RENAME_NOFOLLOW_ANY) {
8917 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8918 	}
8919 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8920 	    segflg, from, ctx);
8921 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8922 
8923 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8924 	    segflg, to, ctx);
8925 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8926 
8927 continue_lookup:
8928 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8929 		if ((error = nameiat(fromnd, fromfd))) {
8930 			goto out1;
8931 		}
8932 		fdvp = fromnd->ni_dvp;
8933 		fvp  = fromnd->ni_vp;
8934 
8935 		if (fvp && fvp->v_type == VDIR) {
8936 			tond->ni_cnd.cn_flags |= WILLBEDIR;
8937 		}
8938 	}
8939 
8940 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8941 		if ((error = nameiat(tond, tofd))) {
8942 			/*
8943 			 * Translate error code for rename("dir1", "dir2/.").
8944 			 */
8945 			if (error == EISDIR && fvp->v_type == VDIR) {
8946 				error = EINVAL;
8947 			}
8948 			goto out1;
8949 		}
8950 		tdvp = tond->ni_dvp;
8951 		tvp  = tond->ni_vp;
8952 	}
8953 
8954 #if DEVELOPMENT || DEBUG
8955 	/*
8956 	 * XXX VSWAP: Check for entitlements or special flag here
8957 	 * so we can restrict access appropriately.
8958 	 */
8959 #else /* DEVELOPMENT || DEBUG */
8960 
8961 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8962 		error = EPERM;
8963 		goto out1;
8964 	}
8965 
8966 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8967 		error = EPERM;
8968 		goto out1;
8969 	}
8970 #endif /* DEVELOPMENT || DEBUG */
8971 
8972 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8973 		error = ENOENT;
8974 		goto out1;
8975 	}
8976 
8977 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8978 		int32_t pval = 0;
8979 		int err = 0;
8980 
8981 		/*
8982 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8983 		 * has the same name as target iff the following conditions are met:
8984 		 * 1. the target file system is case insensitive
8985 		 * 2. source and target directories are the same
8986 		 * 3. source and target files are the same
8987 		 * 4. name only differs in case (determined by underlying filesystem)
8988 		 */
8989 		if (fvp != tvp || fdvp != tdvp) {
8990 			error = EEXIST;
8991 			goto out1;
8992 		}
8993 
8994 		/*
8995 		 * Assume that the target file system is case sensitive if
8996 		 * _PC_CASE_SENSITIVE selector isn't supported.
8997 		 */
8998 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8999 		if (err != 0 || pval != 0) {
9000 			error = EEXIST;
9001 			goto out1;
9002 		}
9003 	}
9004 
9005 	batched = vnode_compound_rename_available(fdvp);
9006 
9007 #if CONFIG_FSE
9008 	need_event = need_fsevent(FSE_RENAME, fdvp);
9009 	if (need_event) {
9010 		if (fvp) {
9011 			get_fse_info(fvp, &from_finfo, ctx);
9012 		} else {
9013 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9014 			if (error) {
9015 				goto out1;
9016 			}
9017 
9018 			fvap = &__rename_data->fv_attr;
9019 		}
9020 
9021 		if (tvp) {
9022 			get_fse_info(tvp, &to_finfo, ctx);
9023 		} else if (batched) {
9024 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9025 			if (error) {
9026 				goto out1;
9027 			}
9028 
9029 			tvap = &__rename_data->tv_attr;
9030 		}
9031 	}
9032 #else
9033 	need_event = 0;
9034 #endif /* CONFIG_FSE */
9035 
9036 	has_listeners = kauth_authorize_fileop_has_listeners();
9037 
9038 	need_kpath2 = 0;
9039 #if CONFIG_AUDIT
9040 	if (AUDIT_RECORD_EXISTS()) {
9041 		need_kpath2 = 1;
9042 	}
9043 #endif
9044 
9045 	if (need_event || has_listeners) {
9046 		if (from_name == NULL) {
9047 			GET_PATH(from_name);
9048 		}
9049 
9050 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9051 
9052 		if (from_name_no_firmlink == NULL) {
9053 			GET_PATH(from_name_no_firmlink);
9054 		}
9055 
9056 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9057 	}
9058 
9059 	if (need_event || need_kpath2 || has_listeners) {
9060 		if (to_name == NULL) {
9061 			GET_PATH(to_name);
9062 		}
9063 
9064 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9065 
9066 		if (to_name_no_firmlink == NULL) {
9067 			GET_PATH(to_name_no_firmlink);
9068 		}
9069 
9070 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9071 		if (to_name && need_kpath2) {
9072 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9073 		}
9074 	}
9075 	if (!fvp) {
9076 		/*
9077 		 * Claim: this check will never reject a valid rename.
9078 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9079 		 * Suppose fdvp and tdvp are not on the same mount.
9080 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9081 		 *      then you can't move it to within another dir on the same mountpoint.
9082 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9083 		 *
9084 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9085 		 */
9086 		if (fdvp->v_mount != tdvp->v_mount) {
9087 			error = EXDEV;
9088 			goto out1;
9089 		}
9090 		goto skipped_lookup;
9091 	}
9092 
9093 	/*
9094 	 * If the source and destination are the same (i.e. they're
9095 	 * links to the same vnode) and the target file system is
9096 	 * case sensitive, then there is nothing to do.
9097 	 *
9098 	 * XXX Come back to this.
9099 	 */
9100 	if (fvp == tvp) {
9101 		int pathconf_val;
9102 
9103 		/*
9104 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9105 		 * then assume that this file system is case sensitive.
9106 		 */
9107 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9108 		    pathconf_val != 0) {
9109 			vn_authorize_skipped = TRUE;
9110 			goto out1;
9111 		}
9112 	}
9113 
9114 	/*
9115 	 * Allow the renaming of mount points.
9116 	 * - target must not exist
9117 	 * - target must reside in the same directory as source
9118 	 * - union mounts cannot be renamed
9119 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9120 	 *
9121 	 * XXX Handle this in VFS after a continued lookup (if we missed
9122 	 * in the cache to start off)
9123 	 *
9124 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9125 	 * we'll skip past here.  The file system is responsible for
9126 	 * checking that @tvp is not a descendent of @fvp and vice versa
9127 	 * so it should always return EINVAL if either @tvp or @fvp is the
9128 	 * root of a volume.
9129 	 */
9130 	if ((fvp->v_flag & VROOT) &&
9131 	    (fvp->v_type == VDIR) &&
9132 	    (tvp == NULL) &&
9133 	    (fvp->v_mountedhere == NULL) &&
9134 	    (fdvp == tdvp) &&
9135 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9136 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9137 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9138 		vnode_t coveredvp;
9139 
9140 		/* switch fvp to the covered vnode */
9141 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9142 		if ((vnode_getwithref(coveredvp))) {
9143 			error = ENOENT;
9144 			goto out1;
9145 		}
9146 		/*
9147 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9148 		 * later.
9149 		 */
9150 		mnt_fvp = fvp;
9151 
9152 		fvp = coveredvp;
9153 		mntrename = TRUE;
9154 	}
9155 	/*
9156 	 * Check for cross-device rename.
9157 	 */
9158 	if ((fvp->v_mount != tdvp->v_mount) ||
9159 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9160 		error = EXDEV;
9161 		goto out1;
9162 	}
9163 
9164 	/*
9165 	 * If source is the same as the destination (that is the
9166 	 * same inode number) then there is nothing to do...
9167 	 * EXCEPT if the underlying file system supports case
9168 	 * insensitivity and is case preserving.  In this case
9169 	 * the file system needs to handle the special case of
9170 	 * getting the same vnode as target (fvp) and source (tvp).
9171 	 *
9172 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9173 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9174 	 * handle the special case of getting the same vnode as target and
9175 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9176 	 * so not to cause locking problems. There is a single reference on tvp.
9177 	 *
9178 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9179 	 * that correct behaviour then is just to return success without doing
9180 	 * anything.
9181 	 *
9182 	 * XXX filesystem should take care of this itself, perhaps...
9183 	 */
9184 	if (fvp == tvp && fdvp == tdvp) {
9185 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9186 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9187 		    fromnd->ni_cnd.cn_namelen)) {
9188 			vn_authorize_skipped = TRUE;
9189 			goto out1;
9190 		}
9191 	}
9192 
9193 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9194 		/*
9195 		 * we're holding a reference and lock
9196 		 * on locked_mp, but it no longer matches
9197 		 * what we want to do... so drop our hold
9198 		 */
9199 		mount_unlock_renames(locked_mp);
9200 		mount_drop(locked_mp, 0);
9201 		holding_mntlock = 0;
9202 	}
9203 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9204 		/*
9205 		 * serialize renames that re-shape
9206 		 * the tree... if holding_mntlock is
9207 		 * set, then we're ready to go...
9208 		 * otherwise we
9209 		 * first need to drop the iocounts
9210 		 * we picked up, second take the
9211 		 * lock to serialize the access,
9212 		 * then finally start the lookup
9213 		 * process over with the lock held
9214 		 */
9215 		if (!holding_mntlock) {
9216 			/*
9217 			 * need to grab a reference on
9218 			 * the mount point before we
9219 			 * drop all the iocounts... once
9220 			 * the iocounts are gone, the mount
9221 			 * could follow
9222 			 */
9223 			locked_mp = fvp->v_mount;
9224 			mount_ref(locked_mp, 0);
9225 
9226 			/*
9227 			 * nameidone has to happen before we vnode_put(tvp)
9228 			 * since it may need to release the fs_nodelock on the tvp
9229 			 */
9230 			nameidone(tond);
9231 
9232 			if (tvp) {
9233 				vnode_put(tvp);
9234 			}
9235 			vnode_put(tdvp);
9236 
9237 			/*
9238 			 * nameidone has to happen before we vnode_put(fdvp)
9239 			 * since it may need to release the fs_nodelock on the fvp
9240 			 */
9241 			nameidone(fromnd);
9242 
9243 			vnode_put(fvp);
9244 			vnode_put(fdvp);
9245 
9246 			if (mnt_fvp != NULLVP) {
9247 				vnode_put(mnt_fvp);
9248 			}
9249 
9250 			mount_lock_renames(locked_mp);
9251 			holding_mntlock = 1;
9252 
9253 			goto retry;
9254 		}
9255 	} else {
9256 		/*
9257 		 * when we dropped the iocounts to take
9258 		 * the lock, we allowed the identity of
9259 		 * the various vnodes to change... if they did,
9260 		 * we may no longer be dealing with a rename
9261 		 * that reshapes the tree... once we're holding
9262 		 * the iocounts, the vnodes can't change type
9263 		 * so we're free to drop the lock at this point
9264 		 * and continue on
9265 		 */
9266 		if (holding_mntlock) {
9267 			mount_unlock_renames(locked_mp);
9268 			mount_drop(locked_mp, 0);
9269 			holding_mntlock = 0;
9270 		}
9271 	}
9272 
9273 	if (!batched) {
9274 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9275 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9276 		    flags, NULL);
9277 		if (error) {
9278 			if (error == ENOENT) {
9279 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9280 					/*
9281 					 * We encountered a race where after doing the namei,
9282 					 * tvp stops being valid. If so, simply re-drive the rename
9283 					 * call from the top.
9284 					 */
9285 					do_retry = 1;
9286 					retry_count += 1;
9287 				}
9288 			}
9289 			goto out1;
9290 		}
9291 	}
9292 
9293 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9294 	if (mnt_fvp != NULLVP) {
9295 		vnode_put(mnt_fvp);
9296 		mnt_fvp = NULLVP;
9297 	}
9298 
9299 	// save these off so we can later verify that fvp is the same
9300 	oname   = fvp->v_name;
9301 	oparent = fvp->v_parent;
9302 
9303 skipped_lookup:
9304 #if CONFIG_FILE_LEASES
9305 	/* Lease break needed for source's parent dir? */
9306 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9307 
9308 	/* Lease break needed for target's parent dir? */
9309 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9310 #endif
9311 
9312 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9313 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9314 	    flags, ctx);
9315 
9316 	if (holding_mntlock) {
9317 		/*
9318 		 * we can drop our serialization
9319 		 * lock now
9320 		 */
9321 		mount_unlock_renames(locked_mp);
9322 		mount_drop(locked_mp, 0);
9323 		holding_mntlock = 0;
9324 	}
9325 	if (error) {
9326 		if (error == EDATALESS) {
9327 			/*
9328 			 * If we've been here before, something has gone
9329 			 * horribly wrong and we should just get out lest
9330 			 * we spiral around the drain forever.
9331 			 */
9332 			if (flags & VFS_RENAME_DATALESS) {
9333 				error = EIO;
9334 				goto out1;
9335 			}
9336 
9337 			/*
9338 			 * The object we're renaming is dataless (or has a
9339 			 * dataless descendent) and requires materialization
9340 			 * before the rename occurs.  But we're holding the
9341 			 * mount point's rename lock, so it's not safe to
9342 			 * make the upcall.
9343 			 *
9344 			 * In this case, we release the lock, perform the
9345 			 * materialization, and start the whole thing over.
9346 			 */
9347 			error = vnode_materialize_dataless_file(fvp,
9348 			    NAMESPACE_HANDLER_RENAME_OP);
9349 
9350 			if (error == 0) {
9351 				/*
9352 				 * The next time around we need to tell the
9353 				 * file system that the materializtaion has
9354 				 * been performed.
9355 				 */
9356 				flags |= VFS_RENAME_DATALESS;
9357 				do_retry = 1;
9358 			}
9359 			goto out1;
9360 		}
9361 		if (error == EKEEPLOOKING) {
9362 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9363 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9364 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9365 				}
9366 			}
9367 
9368 			fromnd->ni_vp = fvp;
9369 			tond->ni_vp = tvp;
9370 
9371 			goto continue_lookup;
9372 		}
9373 
9374 		/*
9375 		 * We may encounter a race in the VNOP where the destination didn't
9376 		 * exist when we did the namei, but it does by the time we go and
9377 		 * try to create the entry. In this case, we should re-drive this rename
9378 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9379 		 * but other filesystems susceptible to this race could return it, too.
9380 		 */
9381 		if (error == ERECYCLE) {
9382 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9383 				do_retry = 1;
9384 				retry_count += 1;
9385 			} else {
9386 				printf("rename retry limit due to ERECYCLE reached\n");
9387 				error = ENOENT;
9388 			}
9389 		}
9390 
9391 		/*
9392 		 * For compound VNOPs, the authorization callback may return
9393 		 * ENOENT in case of racing hardlink lookups hitting the name
9394 		 * cache, redrive the lookup.
9395 		 */
9396 		if (batched && error == ENOENT) {
9397 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9398 				do_retry = 1;
9399 				retry_count += 1;
9400 			}
9401 		}
9402 
9403 		goto out1;
9404 	}
9405 
9406 	/* call out to allow 3rd party notification of rename.
9407 	 * Ignore result of kauth_authorize_fileop call.
9408 	 */
9409 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9410 	    KAUTH_FILEOP_RENAME,
9411 	    (uintptr_t)from_name, (uintptr_t)to_name);
9412 	if (flags & VFS_RENAME_SWAP) {
9413 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9414 		    KAUTH_FILEOP_RENAME,
9415 		    (uintptr_t)to_name, (uintptr_t)from_name);
9416 	}
9417 
9418 #if CONFIG_FSE
9419 	if (from_name != NULL && to_name != NULL) {
9420 		if (from_truncated || to_truncated) {
9421 			// set it here since only the from_finfo gets reported up to user space
9422 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9423 		}
9424 
9425 		if (tvap && tvp) {
9426 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9427 		}
9428 		if (fvap) {
9429 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9430 		}
9431 
9432 		if (tvp) {
9433 			add_fsevent(FSE_RENAME, ctx,
9434 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9435 			    FSE_ARG_FINFO, &from_finfo,
9436 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9437 			    FSE_ARG_FINFO, &to_finfo,
9438 			    FSE_ARG_DONE);
9439 			if (flags & VFS_RENAME_SWAP) {
9440 				/*
9441 				 * Strictly speaking, swap is the equivalent of
9442 				 * *three* renames.  FSEvents clients should only take
9443 				 * the events as a hint, so we only bother reporting
9444 				 * two.
9445 				 */
9446 				add_fsevent(FSE_RENAME, ctx,
9447 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9448 				    FSE_ARG_FINFO, &to_finfo,
9449 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9450 				    FSE_ARG_FINFO, &from_finfo,
9451 				    FSE_ARG_DONE);
9452 			}
9453 		} else {
9454 			add_fsevent(FSE_RENAME, ctx,
9455 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9456 			    FSE_ARG_FINFO, &from_finfo,
9457 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9458 			    FSE_ARG_DONE);
9459 		}
9460 	}
9461 #endif /* CONFIG_FSE */
9462 
9463 	/*
9464 	 * update filesystem's mount point data
9465 	 */
9466 	if (mntrename) {
9467 		char *cp, *pathend, *mpname;
9468 		char * tobuf;
9469 		struct mount *mp;
9470 		int maxlen;
9471 		size_t len = 0;
9472 
9473 		mp = fvp->v_mountedhere;
9474 
9475 		if (vfs_busy(mp, LK_NOWAIT)) {
9476 			error = EBUSY;
9477 			goto out1;
9478 		}
9479 		tobuf = zalloc(ZV_NAMEI);
9480 
9481 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9482 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9483 		} else {
9484 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9485 		}
9486 		if (!error) {
9487 			/* find current mount point prefix */
9488 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9489 			for (cp = pathend; *cp != '\0'; ++cp) {
9490 				if (*cp == '/') {
9491 					pathend = cp + 1;
9492 				}
9493 			}
9494 			/* find last component of target name */
9495 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9496 				if (*cp == '/') {
9497 					mpname = cp + 1;
9498 				}
9499 			}
9500 
9501 			/* Update f_mntonname of sub mounts */
9502 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9503 
9504 			/* append name to prefix */
9505 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9506 			bzero(pathend, maxlen);
9507 
9508 			strlcpy(pathend, mpname, maxlen);
9509 		}
9510 		zfree(ZV_NAMEI, tobuf);
9511 
9512 		vfs_unbusy(mp);
9513 
9514 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9515 	}
9516 	/*
9517 	 * fix up name & parent pointers.  note that we first
9518 	 * check that fvp has the same name/parent pointers it
9519 	 * had before the rename call... this is a 'weak' check
9520 	 * at best...
9521 	 *
9522 	 * XXX oparent and oname may not be set in the compound vnop case
9523 	 */
9524 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9525 		int update_flags;
9526 
9527 		update_flags = VNODE_UPDATE_NAME;
9528 
9529 		if (fdvp != tdvp) {
9530 			update_flags |= VNODE_UPDATE_PARENT;
9531 		}
9532 
9533 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9534 	}
9535 out1:
9536 	/*
9537 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9538 	 * skipped earlier as no actual rename was performed.
9539 	 */
9540 	if (vn_authorize_skipped && error == 0) {
9541 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9542 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9543 		    flags, NULL);
9544 		if (error && error == ENOENT) {
9545 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9546 				do_retry = 1;
9547 				retry_count += 1;
9548 			}
9549 		}
9550 	}
9551 	if (to_name != NULL) {
9552 		RELEASE_PATH(to_name);
9553 		to_name = NULL;
9554 	}
9555 	if (to_name_no_firmlink != NULL) {
9556 		RELEASE_PATH(to_name_no_firmlink);
9557 		to_name_no_firmlink = NULL;
9558 	}
9559 	if (from_name != NULL) {
9560 		RELEASE_PATH(from_name);
9561 		from_name = NULL;
9562 	}
9563 	if (from_name_no_firmlink != NULL) {
9564 		RELEASE_PATH(from_name_no_firmlink);
9565 		from_name_no_firmlink = NULL;
9566 	}
9567 	if (holding_mntlock) {
9568 		mount_unlock_renames(locked_mp);
9569 		mount_drop(locked_mp, 0);
9570 		holding_mntlock = 0;
9571 	}
9572 	if (tdvp) {
9573 		/*
9574 		 * nameidone has to happen before we vnode_put(tdvp)
9575 		 * since it may need to release the fs_nodelock on the tdvp
9576 		 */
9577 		nameidone(tond);
9578 
9579 		if (tvp) {
9580 			vnode_put(tvp);
9581 		}
9582 		vnode_put(tdvp);
9583 	}
9584 	if (fdvp) {
9585 		/*
9586 		 * nameidone has to happen before we vnode_put(fdvp)
9587 		 * since it may need to release the fs_nodelock on the fdvp
9588 		 */
9589 		nameidone(fromnd);
9590 
9591 		if (fvp) {
9592 			vnode_put(fvp);
9593 		}
9594 		vnode_put(fdvp);
9595 	}
9596 	if (mnt_fvp != NULLVP) {
9597 		vnode_put(mnt_fvp);
9598 	}
9599 	/*
9600 	 * If things changed after we did the namei, then we will re-drive
9601 	 * this rename call from the top.
9602 	 */
9603 	if (do_retry) {
9604 		do_retry = 0;
9605 		goto retry;
9606 	}
9607 
9608 	kfree_type(typeof(*__rename_data), __rename_data);
9609 	return error;
9610 }
9611 
9612 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9613 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9614 {
9615 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9616 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9617 }
9618 
9619 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9620 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9621 {
9622 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9623 		return EINVAL;
9624 	}
9625 
9626 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9627 		return EINVAL;
9628 	}
9629 
9630 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9631 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9632 }
9633 
9634 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9635 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9636 {
9637 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9638 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9639 }
9640 
9641 /*
9642  * Make a directory file.
9643  *
9644  * Returns:	0			Success
9645  *		EEXIST
9646  *	namei:???
9647  *	vnode_authorize:???
9648  *	vn_create:???
9649  */
9650 /* ARGSUSED */
/*
 * mkdir1at: common implementation behind mkdir(), mkdirat() and
 * mkdir_extended().
 *
 * Looks up the parent of 'path' (relative to descriptor 'fd' when the path
 * is relative), authorizes the operation, and asks the file system to
 * create the directory with the attributes in 'vap'.  Supports file systems
 * that implement the compound mkdir VNOP (lookup + create in one call),
 * in which case vn_create() may ask us to continue the lookup via
 * EKEEPLOOKING.
 *
 * Parameters:	ctx	vfs context of the caller
 *		path	address of the pathname (address space per 'segflg')
 *		vap	attributes for the new directory; va_type is forced
 *			to VDIR here
 *		fd	base directory fd for relative lookups (AT_FDCWD for
 *			the caller's CWD)
 *		segflg	user/kernel address space selector for 'path'
 *
 * Returns:	0	Success
 *		EEXIST	An object already exists at 'path'
 *	nameiat:???
 *	vn_authorize_mkdir:???
 *	vn_create:???
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* If the lookup resolved an existing object, the mkdir must fail. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Can the FS do lookup+authorize+create in one compound VNOP? */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCES/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the CREATE lookup state before re-looking-up. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCES if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: report the original EACCES/EPERM. */
				goto out;
			} else {
				/* Target exists after all: EEXIST wins over EACCES. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the parent dir: break any dir lease first. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound VNOP needs the lookup continued with the FS's state. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9766 
9767 /*
9768  * mkdir_extended: Create a directory; with extended security (ACL).
9769  *
9770  * Parameters:    p                       Process requesting to create the directory
9771  *                uap                     User argument descriptor (see below)
9772  *                retval                  (ignored)
9773  *
9774  * Indirect:      uap->path               Path of directory to create
9775  *                uap->mode               Access permissions to set
9776  *                uap->xsecurity          ACL to set
9777  *
9778  * Returns:        0                      Success
9779  *                !0                      Not success
9780  *
9781  */
9782 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9783 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9784 {
9785 	int ciferror;
9786 	kauth_filesec_t xsecdst;
9787 	struct vnode_attr va;
9788 
9789 	AUDIT_ARG(owner, uap->uid, uap->gid);
9790 
9791 	xsecdst = NULL;
9792 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9793 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9794 		return ciferror;
9795 	}
9796 
9797 	VATTR_INIT(&va);
9798 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9799 	if (xsecdst != NULL) {
9800 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9801 		va.va_vaflags |= VA_FILESEC_ACL;
9802 	}
9803 
9804 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9805 	    UIO_USERSPACE);
9806 	if (xsecdst != NULL) {
9807 		kauth_filesec_free(xsecdst);
9808 	}
9809 	return ciferror;
9810 }
9811 
9812 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9813 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9814 {
9815 	struct vnode_attr va;
9816 
9817 	VATTR_INIT(&va);
9818 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9819 
9820 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9821 	           UIO_USERSPACE);
9822 }
9823 
9824 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9825 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9826 {
9827 	struct vnode_attr va;
9828 
9829 	VATTR_INIT(&va);
9830 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9831 
9832 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9833 	           UIO_USERSPACE);
9834 }
9835 
/*
 * rmdirat_internal: guts of rmdir() and unlinkat(..., AT_REMOVEDIR).
 *
 * Looks up 'dirpath' (relative to descriptor 'fd' when the path is
 * relative), authorizes and performs the directory removal, emitting
 * fsevents / kauth fileop / MAC notifications on success.  Handles file
 * systems with a compound rmdir VNOP, the ENOTEMPTY retry for dataless
 * directories (VNODE_REMOVE_DATALESS_DIR in 'unlink_flags'), and the
 * AppleDouble orphan cleanup + restart loop.
 *
 * Returns:	0	Success
 *		EBUSY	Attempt to remove the root of a mount
 *		EPERM	Attempt to remove a swap file (non-kernel context)
 *	nameiat:???
 *	vn_authorize_rmdir:???
 *	vn_rmdir:???
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep large namei/vattr state off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel itself may remove an active swap file. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/*
					 * ENOENT here can come from a racing hardlink
					 * lookup hitting the name cache; redrive.
					 */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: authorization is deferred to the compound VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the FS which attrs to report later. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the paths only when someone will consume them. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry writes the parent dir: break any dir lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * 'vp' is used purely as a sleep/wakeup channel below: it may
		 * have been recycled by now, but its address still serves to
		 * serialize concurrent AppleDouble-cleanup restarts.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10117 
10118 /*
10119  * Remove a directory file.
10120  */
10121 /* ARGSUSED */
10122 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10123 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10124 {
10125 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10126 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10127 }
10128 
/*
 * Get direntry length padded to 8 byte alignment.
 * 'namlen' excludes the NUL terminator; struct direntry reserves
 * MAXPATHLEN bytes for d_name, so the unused tail is subtracted out.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Here the NUL terminator is counted explicitly; struct dirent reserves
 * __DARWIN_MAXNAMLEN + 1 bytes for d_name.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last valid byte) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10140 
10141 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10142 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10143     int *numdirent, vfs_context_t ctxp)
10144 {
10145 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10146 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10147 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10148 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10149 	} else {
10150 		size_t bufsize;
10151 		void * bufptr;
10152 		uio_t auio;
10153 		struct direntry *entry64;
10154 		struct dirent *dep;
10155 		size_t bytesread;
10156 		int error;
10157 
10158 		/*
10159 		 * We're here because the underlying file system does not
10160 		 * support direnties or we mounted denying support so we must
10161 		 * fall back to dirents and convert them to direntries.
10162 		 *
10163 		 * Our kernel buffer needs to be smaller since re-packing will
10164 		 * expand each dirent.  The worse case (when the name length
10165 		 * is 3 or less) corresponds to a struct direntry size of 32
10166 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10167 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10168 		 * will prevent us from reading more than we can pack.
10169 		 *
10170 		 * Since this buffer is wired memory, we will limit the
10171 		 * buffer size to a maximum of 32K. We would really like to
10172 		 * use 32K in the MIN(), but we use magic number 87371 to
10173 		 * prevent uio_resid() * 3 / 8 from overflowing.
10174 		 */
10175 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10176 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10177 		if (bufptr == NULL) {
10178 			return ENOMEM;
10179 		}
10180 
10181 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10182 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10183 		auio->uio_offset = uio->uio_offset;
10184 
10185 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10186 
10187 		dep = (struct dirent *)bufptr;
10188 		bytesread = bufsize - uio_resid(auio);
10189 
10190 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10191 		/*
10192 		 * Convert all the entries and copy them out to user's buffer.
10193 		 */
10194 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10195 			/* First check that the dirent struct up to d_name is within the buffer */
10196 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10197 			    /* Check that the length of the entire dirent is within the buffer */
10198 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10199 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10200 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10201 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10202 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10203 				    vp->v_name ? vp->v_name : "<unknown>");
10204 				error = EIO;
10205 				break;
10206 			}
10207 
10208 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10209 
10210 			bzero(entry64, enbufsize);
10211 			/* Convert a dirent to a dirent64. */
10212 			entry64->d_ino = dep->d_ino;
10213 			entry64->d_seekoff = 0;
10214 			entry64->d_reclen = (uint16_t)enbufsize;
10215 			entry64->d_namlen = dep->d_namlen;
10216 			entry64->d_type = dep->d_type;
10217 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10218 
10219 			/* Move to next entry. */
10220 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10221 
10222 			/* Copy entry64 to user's buffer. */
10223 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10224 		}
10225 
10226 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10227 		if (error == 0) {
10228 			uio->uio_offset = auio->uio_offset;
10229 		}
10230 		uio_free(auio);
10231 		kfree_data(bufptr, bufsize);
10232 		kfree_type(struct direntry, entry64);
10233 		return error;
10234 	}
10235 }
10236 
/* Upper bound on the user buffer size honored per getdirentries call (128 MiB). */
#define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10238 
10239 /*
10240  * Read a block of directory entries in a file system independent format.
10241  */
/*
 * getdirentries_common: shared implementation for getdirentries() and
 * getdirentries64().
 *
 * Reads up to 'bufsize' bytes of directory entries from the directory open
 * on descriptor 'fd' into the user buffer 'bufp', advancing the file
 * offset.  When 'flags' contains VNODE_READDIR_EXTENDED the entries are
 * produced in struct direntry format via vnode_readdir64(); otherwise in
 * legacy struct dirent format.
 *
 * Output:	*bytesread	number of bytes placed in the user buffer
 *		*offset		file offset BEFORE this read (for 'basep')
 *		*eofflag	non-zero if end of directory was reached
 *
 * Returns:	0	Success
 *		EBADF	fd not open for reading
 *		EINVAL	fd does not refer to a directory
 *	fp_getfvp:???
 *	VNOP_READDIR:???
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize offset updates on this fileglob, then re-check that the
	 * fd still maps to the vnode we resolved; a racing union-mount
	 * traversal (below) can swap the fd's backing vnode, so retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests to the supported maximum. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Remember the pre-read offset; callers report it via *offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read from the upper layer of a union mount: drop down
	 * to the covered vnode, make it the fd's backing vnode, reset the
	 * offset, and continue reading there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10355 
10356 
int
getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
{
	off_t offset;
	ssize_t bytesread;
	int error, eofflag;

	AUDIT_ARG(fd, uap->fd);
	/* Read entries in the legacy (non-extended) dirent format. */
	error = getdirentries_common(uap->fd, uap->buf, uap->count,
	    &bytesread, &offset, &eofflag, 0);

	if (error == 0) {
		/*
		 * Copy the directory offset ("base") back to userspace using
		 * the caller's native long width.  Note that *retval is set
		 * even if this copyout fails; only `error` reflects the
		 * copyout result.
		 */
		if (proc_is64bit(p)) {
			user64_long_t base = (user64_long_t)offset;
			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
		} else {
			user32_long_t base = (user32_long_t)offset;
			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
		}
		/* Bytes of dirent data placed in the user buffer. */
		*retval = (int)bytesread;
	}
	return error;
}
10380 
int
getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
{
	off_t offset;
	ssize_t bytesread;
	int error, eofflag;
	user_size_t bufsize;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
	 * then the kernel carves out the last 4 bytes to return extended
	 * information to userspace (namely whether we reached EOF with this call).
	 */
	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
	} else {
		bufsize = uap->bufsize;
	}

	/* Read entries in the extended (dirent64) format. */
	error = getdirentries_common(uap->fd, uap->buf, bufsize,
	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);

	if (error == 0) {
		*retval = bytesread;
		/* Return the new directory position to userspace. */
		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));

		/* Report EOF status in the carved-out trailing flags word. */
		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
			getdirentries64_flags_t flags = 0;
			if (eofflag) {
				flags |= GETDIRENTRIES64_EOF;
			}
			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
			    sizeof(flags));
		}
	}
	return error;
}
10420 
10421 
10422 /*
10423  * Set the mode mask for creation of filesystem nodes.
10424  * XXX implement xsecurity
10425  */
10426 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
static int
umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
{
	AUDIT_ARG(mask, newmask);

	/*
	 * Swap in the new file-creation mask under the fdlock and return
	 * the previous mask to the caller.  `fsec` is currently unused
	 * (see "XXX implement xsecurity" above).
	 */
	proc_fdlock(p);
	*retval = p->p_fd.fd_cmask;
	p->p_fd.fd_cmask = newmask & ALLPERMS;
	proc_fdunlock(p);
	return 0;
}
10437 
10438 /*
10439  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10440  *
10441  * Parameters:    p                       Process requesting to set the umask
10442  *                uap                     User argument descriptor (see below)
10443  *                retval                  umask of the process (parameter p)
10444  *
10445  * Indirect:      uap->newmask            umask to set
10446  *                uap->xsecurity          ACL to set
10447  *
10448  * Returns:        0                      Success
10449  *                !0                      Not success
10450  *
10451  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * NOTE: uap->xsecurity is documented as "ACL to set" but is not
	 * consumed here; KAUTH_FILESEC_NONE is passed and umask1() ignores
	 * its fsec argument (xsecurity support is unimplemented, per the
	 * "XXX implement xsecurity" note above).
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10457 
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	/* Classic umask(2): set new mask, leave any existing xsecurity alone. */
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10463 
10464 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10465 	"com.apple.private.vfs.revoke-mounted-device"
10466 
10467 /*
10468  * Void all references to file by ripping underlying filesystem
10469  * away from vnode.
10470  */
10471 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Look up the target path (following symlinks). */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special devices may be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device with a mounted filesystem on it. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/*
	 * The caller must either own the node or be superuser; fetch the
	 * owner uid to check.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually holds the vnode open. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10524 
10525 
10526 /*
10527  *  HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10528  *  The following system calls are designed to support features
10529  *  which are specific to the HFS & HFS Plus volume formats
10530  */
10531 
10532 
10533 /*
10534  * Obtain attribute information on objects in a directory while enumerating
10535  * the directory.
10536  */
10537 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's requested count for union-mount restarts. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still
	 * refers to the same vnode; if it was swapped (e.g. by a concurrent
	 * union-mount descent), drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	/* NOTE(review): error is always 0 here (checked above); retained defensively. */
	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy out actual count, the directory state, and the starting offset. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10701 
10702 /*
10703  * Exchange data between two files
10704  */
10705 
10706 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Both files must be readable and writable by the caller. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Gather paths and fsevent info up front, only if someone is
	 * actually listening for them.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The data swapped, so swap the cached names and parents too,
		 * under the name-cache lock.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10857 
10858 /*
10859  * Return (in MB) the amount of freespace on the given vnode's volume.
10860  */
10861 uint32_t freespace_mb(vnode_t vp);
10862 
10863 uint32_t
freespace_mb(vnode_t vp)10864 freespace_mb(vnode_t vp)
10865 {
10866 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10867 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10868 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10869 }
10870 
10871 #if CONFIG_SEARCHFS
10872 
10873 /* ARGSUSED */
10874 
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together.  */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 *
	 * NOTE(review): there is no minimum-size check that sizeofsearchparams1 is large
	 * enough to hold the leading length word plus an attrreference_t before string_ref
	 * is dereferenced below; the reads stay within the single kalloc'd block, but the
	 * validation may then inspect adjacent data — confirm whether a lower bound is needed.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Allright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11157 
11158 #else /* CONFIG_SEARCHFS */
11159 
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs(2) stub: not supported when CONFIG_SEARCHFS is disabled. */
	return ENOTSUP;
}
11165 
11166 #endif /* CONFIG_SEARCHFS */
11167 
11168 
11169 #if CONFIG_DATALESS_FILES
11170 
11171 /*
11172  * === Namespace Resolver Up-call Mechanism ===
11173  *
11174  * When I/O is performed to a dataless file or directory (read, write,
11175  * lookup-in, etc.), the file system performs an upcall to the namespace
11176  * resolver (filecoordinationd) to materialize the object.
11177  *
11178  * We need multiple up-calls to be in flight at once, and we need these
11179  * up-calls to be interruptible, thus the following implementation:
11180  *
11181  * => The nspace_resolver_request represents the in-kernel request state.
11182  *    It contains a request ID, storage space for the errno code returned
11183  *    by filecoordinationd, and flags.
11184  *
11185  * => The request ID is simply a global monotonically incrementing 32-bit
11186  *    number.  Outstanding requests are stored in a hash table, and the
11187  *    hash function is extremely simple.
11188  *
11189  * => When an upcall is to be made to filecoordinationd, a request structure
11190  *    is allocated on the stack (it is small, and needs to live only during
11191  *    the duration of the call to resolve_nspace_item_ext()).  It is
11192  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11194  *    can be inserted into the table (and thus limiting the number of
11195  *    outstanding requests issued to filecoordinationd); waiting for an
11196  *    available slot is interruptible.
11197  *
11198  * => Once the request has been inserted into the table, the up-call is made
11199  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11200  *    immediately and filecoordinationd processes the request asynchronously.
11201  *
 * => The caller now waits for the request to complete.  This is achieved by
11203  *    sleeping on the address of the request structure and waiting for
11204  *    filecoordinationd to mark the request structure as complete.  This
11205  *    is an interruptible sleep call; if interrupted, the request structure
11206  *    is removed from the table and EINTR is returned to the caller.  If
11207  *    this occurs, an advisory up-call is made to filecoordinationd with
11208  *    the request ID to indicate that the request can be aborted or
11209  *    de-prioritized at the discretion of filecoordinationd.
11210  *
11211  * => When filecoordinationd has completed the request, it signals completion
11212  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11213  *    decorated as a namespace resolver can write to this sysctl node.  The
11214  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11215  *    The request ID is looked up in the table, and if the request is found,
11216  *    the error code is stored in the request structure and a wakeup()
11217  *    issued on the address of the request structure.  If the request is not
11218  *    found, we simply drop the completion notification, assuming that the
11219  *    caller was interrupted.
11220  *
11221  * => When the waiting thread wakes up, it extracts the error code from the
11222  *    request structure, removes the request from the table, and returns the
11223  *    error code to the calling function.  Fini!
11224  */
11225 
/*
 * In-kernel state for one outstanding materialization request to the
 * namespace resolver (see the block comment above).  Lives on the
 * caller's stack for the duration of the up-call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t         r_vp;                           /* vnode being materialized */
	uint32_t        r_req_id;                       /* ID shared with filecoordinationd */
	int             r_resolver_error;               /* errno reported by the resolver */
	int             r_flags;                        /* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001  /* resolver has completed this request */
11235 
/*
 * Return the next request ID.  OSAddAtomic returns the value prior to
 * the addition, so IDs start at 0 and simply wrap at UINT32_MAX.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11243 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (capped at MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* Set when a thread is sleeping, waiting for a table slot to free up. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Mutex guarding the request hash table and the counters above. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is presumably 2^n - 1). */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11264 
11265 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11266 nspace_resolver_req_lookup(uint32_t req_id)
11267 {
11268 	struct nspace_resolver_requesthead *bucket;
11269 	struct nspace_resolver_request *req;
11270 
11271 	bucket = NSPACE_RESOLVER_HASH(req_id);
11272 	LIST_FOREACH(req, bucket, r_hashlink) {
11273 		if (req->r_req_id == req_id) {
11274 			return req;
11275 		}
11276 	}
11277 
11278 	return NULL;
11279 }
11280 
/*
 * Insert a request into the hash table, sleeping (interruptibly) while
 * the table is full.  Returns 0 on success, or the msleep() error
 * (e.g. EINTR) if the wait for a free slot was interrupted.
 * Caller must hold NSPACE_REQ_LOCK.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	/*
	 * Re-set the wait flag on every pass: nspace_resolver_req_remove()
	 * clears it after issuing a wakeup, and other waiters may still be
	 * blocked here.
	 */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
11307 
/*
 * Unlink a request from the hash table and, if another thread is
 * waiting for a free slot, wake it up.
 * Caller must hold NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* A slot just freed up; let any blocked nspace_resolver_req_add() retry. */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}
}
11325 
11326 static void
nspace_resolver_req_cancel(uint32_t req_id)11327 nspace_resolver_req_cancel(uint32_t req_id)
11328 {
11329 	kern_return_t kr;
11330 	mach_port_t mp;
11331 
11332 	// Failures here aren't fatal -- the cancellation message
11333 	// sent to the resolver is merely advisory.
11334 
11335 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11336 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11337 		return;
11338 	}
11339 
11340 	kr = send_nspace_resolve_cancel(mp, req_id);
11341 	if (kr != KERN_SUCCESS) {
11342 		os_log_error(OS_LOG_DEFAULT,
11343 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11344 	}
11345 
11346 	ipc_port_release_send(mp);
11347 }
11348 
/*
 * Sleep (interruptibly) until the resolver marks the request complete.
 * If the sleep is interrupted by a signal (other than ERESTART), the
 * request is abandoned: a synthetic error (EINTR or ETIMEDOUT) is
 * recorded and an advisory cancel is sent to the resolver.  In all
 * cases the request is removed from the table before returning, so a
 * late completion for this id is simply dropped.
 * Returns the resolver's error code (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Cancel outside the lock -- it performs Mach IPC. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	/* req is owned by the caller's stack frame; safe to read unlocked. */
	return req->r_resolver_error;
}
11378 
11379 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11380 nspace_resolver_req_mark_complete(
11381 	struct nspace_resolver_request *req,
11382 	int resolver_error)
11383 {
11384 	req->r_resolver_error = resolver_error;
11385 	req->r_flags |= RRF_COMPLETE;
11386 	wakeup(req);
11387 }
11388 
/*
 * Completion handler invoked (via the vfs.nspace.complete sysctl) when
 * the resolver reports the outcome of request req_id.  If the caller
 * supplied an orig_gencount, the vnode's recursive gencount is
 * re-checked under the mount rename lock and the result is downgraded
 * to EBUSY when the directory changed during materialization.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		// Hold the mount rename lock so the hierarchy can't be
		// rearranged while we validate the gencount below.
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				// treat "can't read gencount" as "no comparison possible"
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11456 
11457 static struct proc *nspace_resolver_proc;
11458 
11459 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11460 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11461 {
11462 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11463 	    p == nspace_resolver_proc) ? 1 : 0;
11464 	return 0;
11465 }
11466 
11467 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11468 
/*
 * Register (is_resolver != 0) or un-register (is_resolver == 0) the
 * calling process as the dataless-file resolver.  Only a root process
 * holding the dataless-resolver entitlement may do either.  Returns
 * EPERM if the caller is not eligible, EBUSY if another process is
 * already registered, otherwise 0.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11508 
11509 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11510 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11511 {
11512 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11513 	    (p->p_vfs_iopolicy &
11514 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11515 		*is_prevented = 1;
11516 	} else {
11517 		*is_prevented = 0;
11518 	}
11519 	return 0;
11520 }
11521 
11522 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11523 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11524 {
11525 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
11526 		return is_prevented ? 0 : EBUSY;
11527 	}
11528 
11529 	if (is_prevented) {
11530 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11531 	} else {
11532 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11533 	}
11534 	return 0;
11535 }
11536 
11537 static int
nspace_materialization_get_thread_state(int * is_prevented)11538 nspace_materialization_get_thread_state(int *is_prevented)
11539 {
11540 	uthread_t ut = current_uthread();
11541 
11542 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11543 	return 0;
11544 }
11545 
11546 static int
nspace_materialization_set_thread_state(int is_prevented)11547 nspace_materialization_set_thread_state(int is_prevented)
11548 {
11549 	uthread_t ut = current_uthread();
11550 
11551 	if (is_prevented) {
11552 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11553 	} else {
11554 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11555 	}
11556 	return 0;
11557 }
11558 
11559 /* the vfs.nspace branch */
11560 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11561 
11562 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11563 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11564     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11565 {
11566 	struct proc *p = req->p;
11567 	int new_value, old_value, changed = 0;
11568 	int error;
11569 
11570 	error = nspace_resolver_get_proc_state(p, &old_value);
11571 	if (error) {
11572 		return error;
11573 	}
11574 
11575 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11576 	    &changed);
11577 	if (error == 0 && changed) {
11578 		error = nspace_resolver_set_proc_state(p, new_value);
11579 	}
11580 	return error;
11581 }
11582 
11583 /* decorate this process as the dataless file resolver */
11584 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11585     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11586     0, 0, sysctl_nspace_resolver, "I", "");
11587 
11588 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11589 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11590     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11591 {
11592 	struct proc *p = req->p;
11593 	int new_value, old_value, changed = 0;
11594 	int error;
11595 
11596 	error = nspace_materialization_get_proc_state(p, &old_value);
11597 	if (error) {
11598 		return error;
11599 	}
11600 
11601 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11602 	    &changed);
11603 	if (error == 0 && changed) {
11604 		error = nspace_materialization_set_proc_state(p, new_value);
11605 	}
11606 	return error;
11607 }
11608 
11609 /* decorate this process as not wanting to materialize dataless files */
11610 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11611     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11612     0, 0, sysctl_nspace_prevent_materialization, "I", "");
11613 
11614 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11615 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11616     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11617 {
11618 	int new_value, old_value, changed = 0;
11619 	int error;
11620 
11621 	error = nspace_materialization_get_thread_state(&old_value);
11622 	if (error) {
11623 		return error;
11624 	}
11625 
11626 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11627 	    &changed);
11628 	if (error == 0 && changed) {
11629 		error = nspace_materialization_set_thread_state(new_value);
11630 	}
11631 	return error;
11632 }
11633 
11634 /* decorate this thread as not wanting to materialize dataless files */
11635 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11636     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11637     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11638 
/*
 * Handler for vfs.nspace.complete, through which the resolver reports
 * a finished request.  Only the registered resolver may call this.
 * The write payload is { req_id, errno } as two uint32_t values,
 * optionally followed by a uint64_t gencount (a second opaque read
 * from the same request buffer).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11683 
11684 /* Resolver reports completed reqs here. */
11685 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11686     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11687     0, 0, sysctl_nspace_complete, "-", "");
11688 
11689 #endif /* CONFIG_DATALESS_FILES */
11690 
11691 #if CONFIG_DATALESS_FILES
11692 #define __no_dataless_unused    /* nothing */
11693 #else
11694 #define __no_dataless_unused    __unused
11695 #endif
11696 
/*
 * Decide whether the given vfs context may materialize dataless files.
 *
 * Returns:
 *   0            materialization may proceed
 *   EJUSTRETURN  entitled manipulator: proceed as if the object were
 *                not dataless
 *   EDEADLK      materialization is prevented (the default)
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11753 
/*
 * One-time initialization of the namespace-resolver request hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11763 
/*
 * Tear-down path for the resolver, used both when the resolver process
 * exits and when it voluntarily un-registers (see
 * nspace_resolver_set_proc_state).  If p is the registered resolver,
 * every outstanding request is completed with ETIMEDOUT and the
 * registration is cleared; otherwise this is a no-op.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Wake every waiter; nobody is left to answer their requests. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11789 
/*
 * Materialize a dataless item; thin wrapper around
 * resolve_nspace_item_ext() with no extra argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11795 
11796 #define DATALESS_RESOLVER_ENTITLEMENT     \
11797 	"com.apple.private.vfs.dataless-resolver"
11798 #define DATALESS_MANIPULATION_ENTITLEMENT \
11799 	"com.apple.private.vfs.dataless-manipulation"
11800 
11801 #if CONFIG_DATALESS_FILES
11802 /*
11803  * Return TRUE if the vfs context is associated with the dataless
11804  * resolver.
11805  */
11806 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)11807 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
11808 {
11809 	return IOTaskHasEntitlement(vfs_context_task(ctx),
11810 	           DATALESS_RESOLVER_ENTITLEMENT);
11811 }
11812 #endif /* CONFIG_DATALESS_FILES */
11813 
11814 /*
11815  * Return TRUE if the vfs context is associated with a process entitled
11816  * for dataless manipulation.
11817  *
11818  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11819  * complication around CONFIG_DATALESS_FILES.
11820  */
11821 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11822 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11823 {
11824 #if CONFIG_DATALESS_FILES
11825 	task_t task = vfs_context_task(ctx);
11826 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11827 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11828 #else
11829 	return false;
11830 #endif /* CONFIG_DATALESS_FILES */
11831 }
11832 
11833 #if CONFIG_DATALESS_FILES
11834 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11835 log_materialization_prevented(vnode_t vp, uint64_t op)
11836 {
11837 	char p_name[MAXCOMLEN + 1];
11838 	char *vntype;
11839 	proc_selfname(&p_name[0], sizeof(p_name));
11840 
11841 	if (vp->v_type == VREG) {
11842 		vntype = "File";
11843 	} else if (vp->v_type == VDIR) {
11844 		vntype = "Dir";
11845 	} else if (vp->v_type == VLNK) {
11846 		vntype = "SymLink";
11847 	} else {
11848 		vntype = "Other";
11849 	}
11850 
11851 #if DEVELOPMENT
11852 	char *path = NULL;
11853 	int   len;
11854 
11855 	path = get_pathbuff();
11856 	len = MAXPATHLEN;
11857 	if (path) {
11858 		vn_getpath(vp, path, &len);
11859 	}
11860 
11861 	os_log_debug(OS_LOG_DEFAULT,
11862 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11863 	    p_name, proc_selfpid(),
11864 	    op, vntype, path ? path : "<unknown-path>");
11865 	if (path) {
11866 		release_pathbuff(path);
11867 	}
11868 #else
11869 	os_log_debug(OS_LOG_DEFAULT,
11870 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11871 	    p_name, proc_selfpid(),
11872 	    op, vntype);
11873 #endif
11874 }
11875 #endif /* CONFIG_DATALESS_FILES */
11876 
11877 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11878 vfs_materialize_item(
11879 	struct vnode *vp __no_dataless_unused,
11880 	uint64_t op __no_dataless_unused,
11881 	int64_t offset __no_dataless_unused,
11882 	int64_t size __no_dataless_unused,
11883 	char *lookup_name __no_dataless_unused,
11884 	size_t const namelen __no_dataless_unused)
11885 {
11886 #if CONFIG_DATALESS_FILES
11887 	struct nspace_resolver_request req;
11888 	kern_return_t kern_ret;
11889 	mach_port_t mach_port;
11890 	char *path = NULL;
11891 	vfs_context_t context;
11892 	int path_len;
11893 	int error;
11894 	audit_token_t atoken;
11895 
11896 	/*
11897 	 * If this is a snapshot event and the vnode is on a disk image just
11898 	 * pretend nothing happened since any change to the disk image will
11899 	 * cause the disk image itself to get backed up and this avoids multi-
11900 	 * way deadlocks between the snapshot handler and the ever popular
11901 	 * diskimages-helper process. The variable nspace_allow_virtual_devs
11902 	 * allows this behavior to be overridden (for use by the Mobile
11903 	 * TimeMachine testing infrastructure which uses disk images).
11904 	 */
11905 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11906 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11907 		return ENOTSUP;
11908 	}
11909 
11910 	context = vfs_context_current();
11911 
11912 	error = vfs_context_dataless_materialization_is_prevented(context);
11913 	if (error) {
11914 		log_materialization_prevented(vp, op);
11915 		return error;
11916 	}
11917 
11918 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11919 	    &mach_port);
11920 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11921 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11922 		/*
11923 		 * Treat this like being unable to access the backing store
11924 		 * server.
11925 		 */
11926 		return ETIMEDOUT;
11927 	}
11928 
11929 	path = zalloc(ZV_NAMEI);
11930 	path_len = MAXPATHLEN;
11931 
11932 	error = vn_getpath(vp, path, &path_len);
11933 	if (error) {
11934 		goto out_release_port;
11935 	}
11936 
11937 	error = vfs_context_copy_audit_token(context, &atoken);
11938 	if (error) {
11939 		goto out_release_port;
11940 	}
11941 
11942 	req.r_req_id = next_nspace_req_id();
11943 	req.r_resolver_error = 0;
11944 	req.r_flags = 0;
11945 	req.r_vp = vp;
11946 
11947 	NSPACE_REQ_LOCK();
11948 	error = nspace_resolver_req_add(&req);
11949 	NSPACE_REQ_UNLOCK();
11950 	if (error) {
11951 		goto out_release_port;
11952 	}
11953 
11954 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
11955 	if (vp->v_type == VDIR) {
11956 		char *tmpname = NULL;
11957 
11958 		/*
11959 		 * If the caller provided a lookup_name *and* a name length,
11960 		 * then we assume the lookup_name is not NUL-terminated.
11961 		 * Allocate a temporary buffer in this case to provide
11962 		 * a NUL-terminated path name to the IPC call.
11963 		 */
11964 		if (lookup_name != NULL && namelen != 0) {
11965 			if (namelen >= PATH_MAX) {
11966 				error = EINVAL;
11967 				goto out_release_port;
11968 			}
11969 			tmpname = zalloc(ZV_NAMEI);
11970 			strlcpy(tmpname, lookup_name, namelen + 1);
11971 			lookup_name = tmpname;
11972 		} else if (lookup_name != NULL) {
11973 			/*
11974 			 * If the caller provided a lookup_name with a
11975 			 * zero name length, then we assume it's NUL-
11976 			 * terminated.  Verify it has a valid length.
11977 			 */
11978 			if (strlen(lookup_name) >= PATH_MAX) {
11979 				error = EINVAL;
11980 				goto out_release_port;
11981 			}
11982 		}
11983 
11984 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
11985 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
11986 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
11987 
11988 		if (tmpname != NULL) {
11989 			zfree(ZV_NAMEI, tmpname);
11990 
11991 			/*
11992 			 * Poison lookup_name rather than reference
11993 			 * freed memory.
11994 			 */
11995 			lookup_name = NULL;
11996 		}
11997 	} else {
11998 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
11999 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12000 		    offset, size, path, atoken);
12001 	}
12002 	if (kern_ret != KERN_SUCCESS) {
12003 		/*
12004 		 * Also treat this like being unable to access the backing
12005 		 * store server.
12006 		 */
12007 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12008 		    kern_ret);
12009 		error = ETIMEDOUT;
12010 
12011 		NSPACE_REQ_LOCK();
12012 		nspace_resolver_req_remove(&req);
12013 		NSPACE_REQ_UNLOCK();
12014 		goto out_release_port;
12015 	}
12016 
12017 	/*
12018 	 * Give back the memory we allocated earlier while we wait; we
12019 	 * no longer need it.
12020 	 */
12021 	zfree(ZV_NAMEI, path);
12022 	path = NULL;
12023 
12024 	/*
12025 	 * Request has been submitted to the resolver. Now (interruptibly)
12026 	 * wait for completion. Upon requrn, the request will have been
12027 	 * removed from the lookup table.
12028 	 */
12029 	error = nspace_resolver_req_wait(&req);
12030 
12031 out_release_port:
12032 	if (path != NULL) {
12033 		zfree(ZV_NAMEI, path);
12034 	}
12035 	ipc_port_release_send(mach_port);
12036 
12037 	return error;
12038 #else
12039 	return ENOTSUP;
12040 #endif /* CONFIG_DATALESS_FILES */
12041 }
12042 
12043 /*
12044  * vfs_materialize_file: Materialize a regular file.
12045  *
12046  * Inputs:
12047  * vp		The dataless file to be materialized.
12048  *
12049  * op		What kind of operation is being performed:
12050  *		-> NAMESPACE_HANDLER_READ_OP
12051  *		-> NAMESPACE_HANDLER_WRITE_OP
12052  *		-> NAMESPACE_HANDLER_LINK_CREATE
12053  *		-> NAMESPACE_HANDLER_DELETE_OP
12054  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12055  *		-> NAMESPACE_HANDLER_RENAME_OP
12056  *
12057  * offset	offset of I/O for READ or WRITE.  Ignored for
12058  *		other ops.
12059  *
12060  * size		size of I/O for READ or WRITE  Ignored for
12061  *		other ops.
12062  *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
12064  * consider the range to be unknown.
12065  *
12066  * Upon successful return, the caller may proceed with the operation.
12067  * N.B. the file may still be "dataless" in this case.
12068  */
12069 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12070 vfs_materialize_file(
12071 	struct vnode *vp,
12072 	uint64_t op,
12073 	int64_t offset,
12074 	int64_t size)
12075 {
12076 	if (vp->v_type != VREG) {
12077 		return EFTYPE;
12078 	}
12079 	return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12080 }
12081 
12082 /*
12083  * vfs_materialize_dir:
12084  *
12085  * Inputs:
12086  * vp		The dataless directory to be materialized.
12087  *
12088  * op		What kind of operation is being performed:
12089  *		-> NAMESPACE_HANDLER_READ_OP
12090  *		-> NAMESPACE_HANDLER_WRITE_OP
12091  *		-> NAMESPACE_HANDLER_DELETE_OP
12092  *		-> NAMESPACE_HANDLER_RENAME_OP
12093  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12094  *
12095  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12096  *		other ops.  May or may not be NUL-terminated; see below.
12097  *
12098  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12099  *		terminated and namelen is the number of valid bytes in
12100  *		lookup_name. If zero, then lookup_name is assumed to be
12101  *		NUL-terminated.
12102  *
12103  * Upon successful return, the caller may proceed with the operation.
12104  * N.B. the directory may still be "dataless" in this case.
12105  */
12106 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12107 vfs_materialize_dir(
12108 	struct vnode *vp,
12109 	uint64_t op,
12110 	char *lookup_name,
12111 	size_t namelen)
12112 {
12113 	if (vp->v_type != VDIR) {
12114 		return EFTYPE;
12115 	}
12116 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12117 		return EINVAL;
12118 	}
12119 	return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12120 }
12121 
/*
 * Legacy materialization entry point: submit a resolve-path request to
 * the resolver over Mach IPC and wait, interruptibly, for the result.
 * Unlike vfs_materialize_item(), this path takes its own vnode_ref on
 * vp for the lifetime of the request.  The arg parameter is unused.
 * Returns 0 when materialization succeeded, or an errno.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			// Unlink the request before this stack frame goes away.
			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12239 
/*
 * No-op snapshot-event hook for the namespace handler: unconditionally
 * reports success.  All parameters are intentionally unused; the stub is
 * kept so existing callers of the hook continue to link and run.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
12246 
#if 0
/*
 * NOTE(review): dead code — compiled out via "#if 0"; retained for
 * reference only.
 *
 * Build a volfs-style "/.vol/<fsid>/<fileid>" path for the given vnode
 * by querying its va_fsid/va_fileid attributes.  On attribute-fetch
 * failure a placeholder path is written and -1 returned; otherwise the
 * volfs path is written and 0 returned.  In both cases *len is updated
 * to the formatted length plus one (for the NUL).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		/* fall back to an obviously-invalid placeholder path */
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12269 
12270 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12271 fsctl_bogus_command_compat(unsigned long cmd)
12272 {
12273 	switch (cmd) {
12274 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12275 		return FSIOC_SYNC_VOLUME;
12276 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12277 		return FSIOC_ROUTEFS_SETROUTEID;
12278 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12279 		return FSIOC_SET_PACKAGE_EXTS;
12280 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12281 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12282 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12283 		return DISK_CONDITIONER_IOC_GET;
12284 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12285 		return DISK_CONDITIONER_IOC_SET;
12286 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12287 		return FSIOC_FIOSEEKHOLE;
12288 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12289 		return FSIOC_FIOSEEKDATA;
12290 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12291 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12292 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12293 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12294 	}
12295 
12296 	return cmd;
12297 }
12298 
12299 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12300 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12301 {
12302 	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12303 }
12304 
12305 static int __attribute__((noinline))
handle_sync_volume(vnode_t vp,vnode_t * arg_vp,caddr_t data,vfs_context_t ctx)12306 handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
12307 {
12308 	struct vfs_attr vfa;
12309 	mount_t mp = vp->v_mount;
12310 	unsigned arg;
12311 	int error;
12312 
12313 	/* record vid of vp so we can drop it below. */
12314 	uint32_t vvid = vp->v_id;
12315 
12316 	/*
12317 	 * Then grab mount_iterref so that we can release the vnode.
12318 	 * Without this, a thread may call vnode_iterate_prepare then
12319 	 * get into a deadlock because we've never released the root vp
12320 	 */
12321 	error = mount_iterref(mp, 0);
12322 	if (error) {
12323 		return error;
12324 	}
12325 	vnode_hold(vp);
12326 	vnode_put(vp);
12327 
12328 	arg = MNT_NOWAIT;
12329 	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
12330 		arg = MNT_WAIT;
12331 	}
12332 
12333 	/*
12334 	 * If the filessytem supports multiple filesytems in a
12335 	 * partition (For eg APFS volumes in a container, it knows
12336 	 * that the waitfor argument to VFS_SYNC are flags.
12337 	 */
12338 	VFSATTR_INIT(&vfa);
12339 	VFSATTR_WANTED(&vfa, f_capabilities);
12340 	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
12341 	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
12342 	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
12343 	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
12344 		arg |= MNT_VOLUME;
12345 	}
12346 
12347 	/* issue the sync for this volume */
12348 	(void)sync_callback(mp, &arg);
12349 
12350 	/*
12351 	 * Then release the mount_iterref once we're done syncing; it's not
12352 	 * needed for the VNOP_IOCTL below
12353 	 */
12354 	mount_iterdrop(mp);
12355 
12356 	if (arg & FSCTL_SYNC_FULLSYNC) {
12357 		/* re-obtain vnode iocount on the root vp, if possible */
12358 		error = vnode_getwithvid(vp, vvid);
12359 		if (error == 0) {
12360 			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
12361 			vnode_put(vp);
12362 		}
12363 	}
12364 	vnode_drop(vp);
12365 	/* mark the argument VP as having been released */
12366 	*arg_vp = NULL;
12367 	return error;
12368 }
12369 
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID handler: mount routefs at the user-supplied
 * path.  Requires super-user credentials; the path string is copied in
 * from user space and handed to routefs_kernel_mount().
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN] = {0};
	size_t pathlen = 0;
	int err;

	err = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (err != 0) {
		return err;
	}
	err = copyinstr(udata, &routepath[0], MAXPATHLEN, &pathlen);
	if (err != 0) {
		return err;
	}
	return routefs_kernel_mount(routepath);
}
#endif
12390 
12391 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12392 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12393 {
12394 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12395 	struct vnode_attr va;
12396 	int error;
12397 
12398 	VATTR_INIT(&va);
12399 	VATTR_SET(&va, va_flags, cas->new_flags);
12400 
12401 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12402 
12403 #if CONFIG_FSE
12404 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12405 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12406 	}
12407 #endif
12408 
12409 	return error;
12410 }
12411 
12412 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12413 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12414 {
12415 	struct mount *mp = NULL;
12416 	errno_t rootauth = 0;
12417 
12418 	mp = vp->v_mount;
12419 
12420 	/*
12421 	 * query the underlying FS and see if it reports something
12422 	 * sane for this vnode. If volume is authenticated via
12423 	 * chunklist, leave that for the caller to determine.
12424 	 */
12425 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12426 
12427 	return rootauth;
12428 }
12429 
12430 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12431 	"com.apple.private.kernel.set-package-extensions"
12432 
12433 /*
12434  * Make a filesystem-specific control call:
12435  */
12436 /* ARGSUSED */
12437 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)12438 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
12439 {
12440 	int error = 0;
12441 	boolean_t is64bit;
12442 	u_int size;
12443 #define STK_PARAMS 128
12444 	char stkbuf[STK_PARAMS] = {0};
12445 	caddr_t data, memp;
12446 	vnode_t vp = *arg_vp;
12447 
12448 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
12449 		return ENOTTY;
12450 	}
12451 
12452 	cmd = fsctl_bogus_command_compat(cmd);
12453 
12454 	size = IOCPARM_LEN(cmd);
12455 	if (size > IOCPARM_MAX) {
12456 		return EINVAL;
12457 	}
12458 
12459 	is64bit = proc_is64bit(p);
12460 
12461 	memp = NULL;
12462 
12463 	if (size > sizeof(stkbuf)) {
12464 		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
12465 			return ENOMEM;
12466 		}
12467 		data = memp;
12468 	} else {
12469 		data = &stkbuf[0];
12470 	};
12471 
12472 	if (cmd & IOC_IN) {
12473 		if (size) {
12474 			error = copyin(udata, data, size);
12475 			if (error) {
12476 				if (memp) {
12477 					kfree_data(memp, size);
12478 				}
12479 				return error;
12480 			}
12481 		} else {
12482 			if (is64bit) {
12483 				*(user_addr_t *)data = udata;
12484 			} else {
12485 				*(uint32_t *)data = (uint32_t)udata;
12486 			}
12487 		};
12488 	} else if ((cmd & IOC_OUT) && size) {
12489 		/*
12490 		 * Zero the buffer so the user always
12491 		 * gets back something deterministic.
12492 		 */
12493 		bzero(data, size);
12494 	} else if (cmd & IOC_VOID) {
12495 		if (is64bit) {
12496 			*(user_addr_t *)data = udata;
12497 		} else {
12498 			*(uint32_t *)data = (uint32_t)udata;
12499 		}
12500 	}
12501 
12502 	/* Check to see if it's a generic command */
12503 	switch (cmd) {
12504 	case FSIOC_SYNC_VOLUME:
12505 		error = handle_sync_volume(vp, arg_vp, data, ctx);
12506 		break;
12507 
12508 	case FSIOC_ROUTEFS_SETROUTEID:
12509 #if ROUTEFS
12510 		error = handle_routes(udata);
12511 #endif
12512 		break;
12513 
12514 	case FSIOC_SET_PACKAGE_EXTS: {
12515 		user_addr_t ext_strings;
12516 		uint32_t    num_entries;
12517 		uint32_t    max_width;
12518 
12519 		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
12520 		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
12521 			error = EPERM;
12522 			break;
12523 		}
12524 
12525 		if ((is64bit && size != sizeof(user64_package_ext_info))
12526 		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
12527 			// either you're 64-bit and passed a 64-bit struct or
12528 			// you're 32-bit and passed a 32-bit struct.  otherwise
12529 			// it's not ok.
12530 			error = EINVAL;
12531 			break;
12532 		}
12533 
12534 		if (is64bit) {
12535 			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
12536 				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
12537 			}
12538 			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
12539 			num_entries = ((user64_package_ext_info *)data)->num_entries;
12540 			max_width   = ((user64_package_ext_info *)data)->max_width;
12541 		} else {
12542 			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
12543 			num_entries = ((user32_package_ext_info *)data)->num_entries;
12544 			max_width   = ((user32_package_ext_info *)data)->max_width;
12545 		}
12546 		error = set_package_extensions_table(ext_strings, num_entries, max_width);
12547 	}
12548 	break;
12549 
12550 	case FSIOC_SET_FSTYPENAME_OVERRIDE:
12551 	{
12552 		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12553 			break;
12554 		}
12555 		if (vp->v_mount) {
12556 			mount_lock(vp->v_mount);
12557 			if (data[0] != 0) {
12558 				int i;
12559 				for (i = 0; i < MFSTYPENAMELEN; i++) {
12560 					if (!data[i]) {
12561 						goto continue_copy;
12562 					}
12563 				}
12564 				/*
12565 				 * Getting here means we have a user data string which has no
12566 				 * NULL termination in its first MFSTYPENAMELEN bytes.
12567 				 * This is bogus, let's avoid strlcpy-ing the read data and
12568 				 * return an error.
12569 				 */
12570 				error = EINVAL;
12571 				goto unlock;
12572 continue_copy:
12573 				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
12574 				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
12575 				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
12576 					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
12577 					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
12578 				}
12579 			} else {
12580 				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
12581 					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
12582 				}
12583 				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
12584 				vp->v_mount->fstypename_override[0] = '\0';
12585 			}
12586 unlock:
12587 			mount_unlock(vp->v_mount);
12588 		}
12589 	}
12590 	break;
12591 
12592 	case DISK_CONDITIONER_IOC_GET: {
12593 		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12594 	}
12595 	break;
12596 
12597 	case DISK_CONDITIONER_IOC_SET: {
12598 		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
12599 	}
12600 	break;
12601 
12602 	case FSIOC_CAS_BSDFLAGS:
12603 		error = handle_flags(vp, data, ctx);
12604 		break;
12605 
12606 	case FSIOC_FD_ONLY_OPEN_ONCE: {
12607 		error = 0;
12608 		if (vnode_usecount(vp) > 1) {
12609 			vnode_lock_spin(vp);
12610 			if (vp->v_lflag & VL_HASSTREAMS) {
12611 				if (vnode_isinuse_locked(vp, 1, 1)) {
12612 					error = EBUSY;
12613 				}
12614 			} else if (vnode_usecount(vp) > 1) {
12615 				error = EBUSY;
12616 			}
12617 			vnode_unlock(vp);
12618 		}
12619 	}
12620 	break;
12621 
12622 	case FSIOC_EVAL_ROOTAUTH:
12623 		error = handle_auth(vp, cmd, data, options, ctx);
12624 		break;
12625 
12626 	case FSIOC_TEST_FSE_ACCESS_GRANTED:
12627 		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
12628 		break;
12629 
12630 	default: {
12631 		/* other, known commands shouldn't be passed down here */
12632 		switch (cmd) {
12633 		case F_PUNCHHOLE:
12634 		case F_TRIM_ACTIVE_FILE:
12635 		case F_RDADVISE:
12636 		case F_TRANSCODEKEY:
12637 		case F_GETPROTECTIONLEVEL:
12638 		case F_GETDEFAULTPROTLEVEL:
12639 		case F_MAKECOMPRESSED:
12640 		case F_SET_GREEDY_MODE:
12641 		case F_SETSTATICCONTENT:
12642 		case F_SETIOTYPE:
12643 		case F_SETBACKINGSTORE:
12644 		case F_GETPATH_MTMINFO:
12645 		case APFSIOC_REVERT_TO_SNAPSHOT:
12646 		case FSIOC_FIOSEEKHOLE:
12647 		case FSIOC_FIOSEEKDATA:
12648 		case HFS_GET_BOOT_INFO:
12649 		case HFS_SET_BOOT_INFO:
12650 		case FIOPINSWAP:
12651 		case F_CHKCLEAN:
12652 		case F_FULLFSYNC:
12653 		case F_BARRIERFSYNC:
12654 		case F_FREEZE_FS:
12655 		case F_THAW_FS:
12656 		case FSIOC_KERNEL_ROOTAUTH:
12657 		case FSIOC_GRAFT_FS:
12658 		case FSIOC_UNGRAFT_FS:
12659 		case FSIOC_AUTH_FS:
12660 			error = EINVAL;
12661 			goto outdrop;
12662 		}
12663 		/* Invoke the filesystem-specific code */
12664 		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12665 	}
12666 	} /* end switch stmt */
12667 
12668 	/*
12669 	 * if no errors, copy any data to user. Size was
12670 	 * already set and checked above.
12671 	 */
12672 	if (error == 0 && (cmd & IOC_OUT) && size) {
12673 		error = copyout(data, udata, size);
12674 	}
12675 
12676 outdrop:
12677 	if (memp) {
12678 		kfree_data(memp, size);
12679 	}
12680 
12681 	return error;
12682 }
12683 
12684 /* ARGSUSED */
12685 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12686 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12687 {
12688 	int error;
12689 	struct nameidata nd;
12690 	uint32_t nameiflags;
12691 	vnode_t vp = NULL;
12692 	vfs_context_t ctx = vfs_context_current();
12693 
12694 	AUDIT_ARG(cmd, (int)uap->cmd);
12695 	AUDIT_ARG(value32, uap->options);
12696 	/* Get the vnode for the file we are getting info on:  */
12697 	nameiflags = 0;
12698 	//
12699 	// if we come through fsctl() then the file is by definition not open.
12700 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12701 	// lest the caller mistakenly thinks the only open is their own (but in
12702 	// reality it's someone elses).
12703 	//
12704 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12705 		return EINVAL;
12706 	}
12707 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12708 		nameiflags |= FOLLOW;
12709 	}
12710 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12711 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12712 	}
12713 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12714 	    UIO_USERSPACE, uap->path, ctx);
12715 	if ((error = namei(&nd))) {
12716 		goto done;
12717 	}
12718 	vp = nd.ni_vp;
12719 	nameidone(&nd);
12720 
12721 #if CONFIG_MACF
12722 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12723 	if (error) {
12724 		goto done;
12725 	}
12726 #endif
12727 
12728 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12729 
12730 done:
12731 	if (vp) {
12732 		vnode_put(vp);
12733 	}
12734 	return error;
12735 }
12736 /* ARGSUSED */
12737 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12738 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12739 {
12740 	int error;
12741 	vnode_t vp = NULL;
12742 	vfs_context_t ctx = vfs_context_current();
12743 	int fd = -1;
12744 
12745 	AUDIT_ARG(fd, uap->fd);
12746 	AUDIT_ARG(cmd, (int)uap->cmd);
12747 	AUDIT_ARG(value32, uap->options);
12748 
12749 	/* Get the vnode for the file we are getting info on:  */
12750 	if ((error = file_vnode(uap->fd, &vp))) {
12751 		return error;
12752 	}
12753 	fd = uap->fd;
12754 	if ((error = vnode_getwithref(vp))) {
12755 		file_drop(fd);
12756 		return error;
12757 	}
12758 
12759 #if CONFIG_MACF
12760 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12761 		file_drop(fd);
12762 		vnode_put(vp);
12763 		return error;
12764 	}
12765 #endif
12766 
12767 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12768 
12769 	file_drop(fd);
12770 
12771 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12772 	if (vp) {
12773 		vnode_put(vp);
12774 	}
12775 
12776 	return error;
12777 }
12778 /* end of fsctl system call */
12779 
12780 #define FILESEC_ACCESS_ENTITLEMENT              \
12781 	"com.apple.private.vfs.filesec-access"
12782 
12783 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12784 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12785 {
12786 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12787 		/*
12788 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12789 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12790 		 */
12791 		if ((!setting && vfs_context_issuser(ctx)) ||
12792 		    IOTaskHasEntitlement(vfs_context_task(ctx),
12793 		    FILESEC_ACCESS_ENTITLEMENT)) {
12794 			return 0;
12795 		}
12796 	}
12797 
12798 	return EPERM;
12799 }
12800 
12801 /*
12802  *  Retrieve the data of an extended attribute.
12803  */
12804 int
getxattr(proc_t p,struct getxattr_args * uap,user_ssize_t * retval)12805 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
12806 {
12807 	vnode_t vp;
12808 	struct nameidata nd;
12809 	char attrname[XATTR_MAXNAMELEN + 1];
12810 	vfs_context_t ctx = vfs_context_current();
12811 	uio_t auio = NULL;
12812 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12813 	size_t attrsize = 0;
12814 	size_t namelen;
12815 	u_int32_t nameiflags;
12816 	int error;
12817 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12818 
12819 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12820 		return EINVAL;
12821 	}
12822 
12823 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12824 	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
12825 	if ((error = namei(&nd))) {
12826 		return error;
12827 	}
12828 	vp = nd.ni_vp;
12829 	nameidone(&nd);
12830 
12831 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12832 	if (error != 0) {
12833 		goto out;
12834 	}
12835 	if (xattr_protected(attrname) &&
12836 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12837 		goto out;
12838 	}
12839 	/*
12840 	 * the specific check for 0xffffffff is a hack to preserve
12841 	 * binaray compatibilty in K64 with applications that discovered
12842 	 * that passing in a buf pointer and a size of -1 resulted in
12843 	 * just the size of the indicated extended attribute being returned.
12844 	 * this isn't part of the documented behavior, but because of the
12845 	 * original implemtation's check for "uap->size > 0", this behavior
12846 	 * was allowed. In K32 that check turned into a signed comparison
12847 	 * even though uap->size is unsigned...  in K64, we blow by that
12848 	 * check because uap->size is unsigned and doesn't get sign smeared
12849 	 * in the munger for a 32 bit user app.  we also need to add a
12850 	 * check to limit the maximum size of the buffer being passed in...
12851 	 * unfortunately, the underlying fileystems seem to just malloc
12852 	 * the requested size even if the actual extended attribute is tiny.
12853 	 * because that malloc is for kernel wired memory, we have to put a
12854 	 * sane limit on it.
12855 	 *
12856 	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
12857 	 * U64 running on K64 will yield -1 (64 bits wide)
12858 	 * U32/U64 running on K32 will yield -1 (32 bits wide)
12859 	 */
12860 	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
12861 		goto no_uio;
12862 	}
12863 
12864 	if (uap->value) {
12865 		if (uap->size > (size_t)XATTR_MAXSIZE) {
12866 			uap->size = XATTR_MAXSIZE;
12867 		}
12868 
12869 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12870 		    &uio_buf[0], sizeof(uio_buf));
12871 		uio_addiov(auio, uap->value, uap->size);
12872 	}
12873 no_uio:
12874 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
12875 out:
12876 	vnode_put(vp);
12877 
12878 	if (auio) {
12879 		*retval = uap->size - uio_resid(auio);
12880 	} else {
12881 		*retval = (user_ssize_t)attrsize;
12882 	}
12883 
12884 	return error;
12885 }
12886 
12887 /*
12888  * Retrieve the data of an extended attribute.
12889  */
12890 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12891 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12892 {
12893 	vnode_t vp;
12894 	char attrname[XATTR_MAXNAMELEN + 1];
12895 	vfs_context_t ctx = vfs_context_current();
12896 	uio_t auio = NULL;
12897 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12898 	size_t attrsize = 0;
12899 	size_t namelen;
12900 	int error;
12901 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12902 
12903 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12904 		return EINVAL;
12905 	}
12906 
12907 	if ((error = file_vnode(uap->fd, &vp))) {
12908 		return error;
12909 	}
12910 	if ((error = vnode_getwithref(vp))) {
12911 		file_drop(uap->fd);
12912 		return error;
12913 	}
12914 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12915 	if (error != 0) {
12916 		goto out;
12917 	}
12918 	if (xattr_protected(attrname) &&
12919 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12920 		goto out;
12921 	}
12922 	if (uap->value && uap->size > 0) {
12923 		if (uap->size > (size_t)XATTR_MAXSIZE) {
12924 			uap->size = XATTR_MAXSIZE;
12925 		}
12926 
12927 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12928 		    &uio_buf[0], sizeof(uio_buf));
12929 		uio_addiov(auio, uap->value, uap->size);
12930 	}
12931 
12932 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
12933 out:
12934 	(void)vnode_put(vp);
12935 	file_drop(uap->fd);
12936 
12937 	if (auio) {
12938 		*retval = uap->size - uio_resid(auio);
12939 	} else {
12940 		*retval = (user_ssize_t)attrsize;
12941 	}
12942 	return error;
12943 }
12944 
/*
 * Scratch state for setxattr(): the nameidata plus the uio buffer is
 * sizable, so setxattr() heap-allocates this rather than carrying it on
 * the kernel stack.  (NOTE(review): the previous "struct for checkdirs
 * iteration" comment appears to have been a copy/paste error.)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
12951 
12952 /*
12953  * Set the data of an extended attribute.
12954  */
12955 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)12956 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
12957 {
12958 	vnode_t vp;
12959 	vfs_context_t ctx = vfs_context_current();
12960 	uio_t auio = NULL;
12961 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12962 	size_t namelen;
12963 	u_int32_t nameiflags;
12964 	int error;
12965 	struct setxattr_ctx *sactx;
12966 
12967 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12968 		return EINVAL;
12969 	}
12970 
12971 	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
12972 	if (sactx == NULL) {
12973 		return ENOMEM;
12974 	}
12975 
12976 	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
12977 	if (error != 0) {
12978 		if (error == EPERM) {
12979 			/* if the string won't fit in attrname, copyinstr emits EPERM */
12980 			error = ENAMETOOLONG;
12981 		}
12982 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12983 		goto out;
12984 	}
12985 	if (xattr_protected(sactx->attrname) &&
12986 	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
12987 		goto out;
12988 	}
12989 	if (uap->size != 0 && uap->value == 0) {
12990 		error = EINVAL;
12991 		goto out;
12992 	}
12993 	if (uap->size > INT_MAX) {
12994 		error = E2BIG;
12995 		goto out;
12996 	}
12997 
12998 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12999 #if CONFIG_FILE_LEASES
13000 	nameiflags |= WANTPARENT;
13001 #endif
13002 	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
13003 	if ((error = namei(&sactx->nd))) {
13004 		goto out;
13005 	}
13006 	vp = sactx->nd.ni_vp;
13007 #if CONFIG_FILE_LEASES
13008 	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
13009 	vnode_put(sactx->nd.ni_dvp);
13010 #endif
13011 	nameidone(&sactx->nd);
13012 
13013 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13014 	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
13015 	uio_addiov(auio, uap->value, uap->size);
13016 
13017 	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
13018 #if CONFIG_FSE
13019 	if (error == 0) {
13020 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13021 		    FSE_ARG_VNODE, vp,
13022 		    FSE_ARG_DONE);
13023 	}
13024 #endif
13025 	vnode_put(vp);
13026 out:
13027 	kfree_type(struct setxattr_ctx, sactx);
13028 	*retval = 0;
13029 	return error;
13030 }
13031 
13032 /*
13033  * Set the data of an extended attribute.
13034  */
13035 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13036 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13037 {
13038 	vnode_t vp;
13039 	char attrname[XATTR_MAXNAMELEN + 1];
13040 	vfs_context_t ctx = vfs_context_current();
13041 	uio_t auio = NULL;
13042 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13043 	size_t namelen;
13044 	int error;
13045 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13046 
13047 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13048 		return EINVAL;
13049 	}
13050 
13051 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13052 	if (error != 0) {
13053 		if (error == EPERM) {
13054 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13055 			return ENAMETOOLONG;
13056 		}
13057 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13058 		return error;
13059 	}
13060 	if (xattr_protected(attrname) &&
13061 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13062 		return error;
13063 	}
13064 	if (uap->size != 0 && uap->value == 0) {
13065 		return EINVAL;
13066 	}
13067 	if (uap->size > INT_MAX) {
13068 		return E2BIG;
13069 	}
13070 	if ((error = file_vnode(uap->fd, &vp))) {
13071 		return error;
13072 	}
13073 	if ((error = vnode_getwithref(vp))) {
13074 		file_drop(uap->fd);
13075 		return error;
13076 	}
13077 
13078 #if CONFIG_FILE_LEASES
13079 	vnode_breakdirlease(vp, true, O_WRONLY);
13080 #endif
13081 
13082 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13083 	    &uio_buf[0], sizeof(uio_buf));
13084 	uio_addiov(auio, uap->value, uap->size);
13085 
13086 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13087 #if CONFIG_FSE
13088 	if (error == 0) {
13089 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13090 		    FSE_ARG_VNODE, vp,
13091 		    FSE_ARG_DONE);
13092 	}
13093 #endif
13094 	vnode_put(vp);
13095 	file_drop(uap->fd);
13096 	*retval = 0;
13097 	return error;
13098 }
13099 
13100 /*
13101  * Remove an extended attribute.
13102  * XXX Code duplication here.
13103  */
13104 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13105 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13106 {
13107 	vnode_t vp;
13108 	struct nameidata nd;
13109 	char attrname[XATTR_MAXNAMELEN + 1];
13110 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13111 	vfs_context_t ctx = vfs_context_current();
13112 	size_t namelen;
13113 	u_int32_t nameiflags;
13114 	int error;
13115 
13116 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13117 		return EINVAL;
13118 	}
13119 
13120 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13121 	if (error != 0) {
13122 		return error;
13123 	}
13124 	if (xattr_protected(attrname)) {
13125 		return EPERM;
13126 	}
13127 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13128 #if CONFIG_FILE_LEASES
13129 	nameiflags |= WANTPARENT;
13130 #endif
13131 	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13132 	if ((error = namei(&nd))) {
13133 		return error;
13134 	}
13135 	vp = nd.ni_vp;
13136 #if CONFIG_FILE_LEASES
13137 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13138 	vnode_put(nd.ni_dvp);
13139 #endif
13140 	nameidone(&nd);
13141 
13142 	error = vn_removexattr(vp, attrname, uap->options, ctx);
13143 #if CONFIG_FSE
13144 	if (error == 0) {
13145 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13146 		    FSE_ARG_VNODE, vp,
13147 		    FSE_ARG_DONE);
13148 	}
13149 #endif
13150 	vnode_put(vp);
13151 	*retval = 0;
13152 	return error;
13153 }
13154 
13155 /*
13156  * Remove an extended attribute.
13157  * XXX Code duplication here.
13158  */
13159 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13160 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13161 {
13162 	vnode_t vp;
13163 	char attrname[XATTR_MAXNAMELEN + 1];
13164 	size_t namelen;
13165 	int error;
13166 #if CONFIG_FSE
13167 	vfs_context_t ctx = vfs_context_current();
13168 #endif
13169 
13170 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13171 		return EINVAL;
13172 	}
13173 
13174 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13175 	if (error != 0) {
13176 		return error;
13177 	}
13178 	if (xattr_protected(attrname)) {
13179 		return EPERM;
13180 	}
13181 	if ((error = file_vnode(uap->fd, &vp))) {
13182 		return error;
13183 	}
13184 	if ((error = vnode_getwithref(vp))) {
13185 		file_drop(uap->fd);
13186 		return error;
13187 	}
13188 
13189 #if CONFIG_FILE_LEASES
13190 	vnode_breakdirlease(vp, true, O_WRONLY);
13191 #endif
13192 
13193 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13194 #if CONFIG_FSE
13195 	if (error == 0) {
13196 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13197 		    FSE_ARG_VNODE, vp,
13198 		    FSE_ARG_DONE);
13199 	}
13200 #endif
13201 	vnode_put(vp);
13202 	file_drop(uap->fd);
13203 	*retval = 0;
13204 	return error;
13205 }
13206 
13207 /*
13208  * Retrieve the list of extended attribute names.
13209  * XXX Code duplication here.
13210  */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	/* Stack-resident backing store for the single-iovec uio below. */
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* These options are not accepted by this interface. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* XATTR_NOFOLLOW: operate on a symlink itself rather than its target. */
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/*
	 * With no buffer (or a zero size) the caller is only asking how much
	 * space is needed; leave auio NULL so vn_listxattr reports attrsize.
	 */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Number of bytes actually copied into the caller's buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Size required to hold the complete name list. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13251 
13252 /*
13253  * Retrieve the list of extended attribute names.
13254  * XXX Code duplication here.
13255  */
int
flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	int error;
	/* Stack-resident backing store for the single-iovec uio below. */
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* Path-resolution / internal options make no sense on an open fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Translate the descriptor to a vnode, then take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	/*
	 * With no buffer (or a zero size) the caller is only asking how much
	 * space is needed; leave auio NULL so vn_listxattr reports attrsize.
	 */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype,
		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());

	/* Release in acquisition order: iocount first, then the fd reference. */
	vnode_put(vp);
	file_drop(uap->fd);
	if (auio) {
		/* Number of bytes actually copied into the caller's buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Size required to hold the complete name list. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13294 
13295 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13296 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13297     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13298 {
13299 	int error;
13300 	struct mount *mp = NULL;
13301 	vnode_t vp;
13302 	int length;
13303 	int bpflags;
13304 	/* maximum number of times to retry build_path */
13305 	unsigned int retries = 0x10;
13306 
13307 	if (bufsize > PAGE_SIZE) {
13308 		return EINVAL;
13309 	}
13310 
13311 	if (buf == NULL) {
13312 		return ENOMEM;
13313 	}
13314 
13315 retry:
13316 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13317 		error = ENOTSUP;  /* unexpected failure */
13318 		return ENOTSUP;
13319 	}
13320 
13321 #if CONFIG_UNION_MOUNTS
13322 unionget:
13323 #endif /* CONFIG_UNION_MOUNTS */
13324 	if (objid == 2) {
13325 		struct vfs_attr vfsattr;
13326 		int use_vfs_root = TRUE;
13327 
13328 		VFSATTR_INIT(&vfsattr);
13329 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13330 		if (!(options & FSOPT_ISREALFSID) &&
13331 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13332 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13333 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13334 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13335 				use_vfs_root = FALSE;
13336 			}
13337 		}
13338 
13339 		if (use_vfs_root) {
13340 			error = VFS_ROOT(mp, &vp, ctx);
13341 		} else {
13342 			error = VFS_VGET(mp, objid, &vp, ctx);
13343 		}
13344 	} else {
13345 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13346 	}
13347 
13348 #if CONFIG_UNION_MOUNTS
13349 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13350 		/*
13351 		 * If the fileid isn't found and we're in a union
13352 		 * mount volume, then see if the fileid is in the
13353 		 * mounted-on volume.
13354 		 */
13355 		struct mount *tmp = mp;
13356 		mp = vnode_mount(tmp->mnt_vnodecovered);
13357 		vfs_unbusy(tmp);
13358 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13359 			goto unionget;
13360 		}
13361 	} else {
13362 		vfs_unbusy(mp);
13363 	}
13364 #else
13365 	vfs_unbusy(mp);
13366 #endif /* CONFIG_UNION_MOUNTS */
13367 
13368 	if (error) {
13369 		return error;
13370 	}
13371 
13372 #if CONFIG_MACF
13373 	error = mac_vnode_check_fsgetpath(ctx, vp);
13374 	if (error) {
13375 		vnode_put(vp);
13376 		return error;
13377 	}
13378 #endif
13379 
13380 	/* Obtain the absolute path to this vnode. */
13381 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13382 	if (options & FSOPT_NOFIRMLINKPATH) {
13383 		bpflags |= BUILDPATH_NO_FIRMLINK;
13384 	}
13385 	bpflags |= BUILDPATH_CHECK_MOVED;
13386 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13387 	vnode_put(vp);
13388 
13389 	if (error) {
13390 		/* there was a race building the path, try a few more times */
13391 		if (error == EAGAIN) {
13392 			--retries;
13393 			if (retries > 0) {
13394 				goto retry;
13395 			}
13396 
13397 			error = ENOENT;
13398 		}
13399 		goto out;
13400 	}
13401 
13402 	AUDIT_ARG(text, buf);
13403 
13404 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13405 		unsigned long path_words[NUMPARMS];
13406 		size_t path_len = sizeof(path_words);
13407 
13408 		if ((size_t)length < path_len) {
13409 			memcpy((char *)path_words, buf, length);
13410 			memset((char *)path_words + length, 0, path_len - length);
13411 
13412 			path_len = length;
13413 		} else {
13414 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13415 		}
13416 
13417 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13418 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13419 	}
13420 
13421 	*pathlen = length; /* may be superseded by error */
13422 
13423 out:
13424 	return error;
13425 }
13426 
13427 /*
13428  * Obtain the full pathname of a file system object by id.
13429  */
/*
 * Obtain the full pathname of a file system object by id.
 *
 * Copies the fsid in from user space, resolves (fsid, objid) to a path
 * via fsgetpath_internal(), and copies the result out to buf.  On
 * success *retval holds the path length.
 */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only the firmlink and real-fsid options are recognized here. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > PAGE_SIZE || bufsize <= 0) {
		return EINVAL;
	}
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	/* Copy the resolved path out to the caller's buffer. */
	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
13473 
13474 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13475 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13476 {
13477 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13478 	           0, retval);
13479 }
13480 
13481 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13482 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13483 {
13484 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13485 	           uap->options, retval);
13486 }
13487 
13488 /*
13489  * Common routine to handle various flavors of statfs data heading out
13490  *	to user space.
13491  *
13492  * Returns:	0			Success
13493  *		EFAULT
13494  */
/*
 * Common routine to handle various flavors of statfs data heading out
 *	to user space.
 *
 * Fills a user32/user64 statfs image from the in-kernel vfsstatfs and
 * copies it out to bufp.  With partial_copy set, the trailing reserved
 * fields are omitted from the copyout.  If sizep is non-NULL it receives
 * the full (un-truncated) structure size.
 *
 * Returns:	0			Success
 *		EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts present an overridden fs type name (e.g. for compatibility). */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Trailing reserved fields are not copied out in this mode. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not be setting these to -1.
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Trailing reserved fields are not copied out in this mode. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* Report the full structure size even when the copy was partial. */
		*sizep = my_size;
	}
	return error;
}
13616 
13617 /*
13618  * copy stat structure into user_stat structure.
13619  */
/*
 * copy stat structure into user_stat structure.
 *
 * Field-by-field ABI translation from the kernel's struct stat to the
 * 64-bit user-space layout.  Zeroed first so padding and any fields not
 * explicitly assigned contain no stale data.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style time fields */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	/* flat sec/nsec time fields used under strict POSIX */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13656 
/*
 * Field-by-field ABI translation from the kernel's struct stat to the
 * 32-bit user-space layout.  Time fields are explicitly narrowed to the
 * 32-bit user types; the destination is zeroed first so padding and any
 * fields not explicitly assigned contain no stale data.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style time fields, narrowed to 32-bit user types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* flat sec/nsec time fields used under strict POSIX */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13693 
13694 /*
13695  * copy stat64 structure into user_stat64 structure.
13696  */
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Same as munge_user64_stat() but for the stat64 layout, which adds the
 * birthtime fields.  The destination is zeroed first so padding and any
 * fields not explicitly assigned contain no stale data.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style time fields, including birthtime */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	/* flat sec/nsec time fields used under strict POSIX */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13737 
/*
 * Same as munge_user32_stat() but for the stat64 layout, which adds the
 * birthtime fields.  Time fields are explicitly narrowed to the 32-bit
 * user types; the destination is zeroed first so padding and any fields
 * not explicitly assigned contain no stale data.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style time fields (incl. birthtime), narrowed to 32-bit types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* flat sec/nsec time fields used under strict POSIX */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13778 
13779 /*
13780  * Purge buffer cache for simulating cold starts
13781  */
13782 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13783 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13784 {
13785 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13786 
13787 	return VNODE_RETURNED;
13788 }
13789 
13790 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13791 vfs_purge_callback(mount_t mp, __unused void * arg)
13792 {
13793 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13794 
13795 	return VFS_RETURNED;
13796 }
13797 
13798 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13799 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13800 {
13801 	if (!kauth_cred_issuser(kauth_cred_get())) {
13802 		return EPERM;
13803 	}
13804 
13805 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13806 
13807 	return 0;
13808 }
13809 
13810 /*
13811  * gets the vnode associated with the (unnamed) snapshot directory
13812  * for a Filesystem. The snapshot directory vnode is returned with
13813  * an iocount on it.
13814  */
13815 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13816 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13817 {
13818 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13819 }
13820 
13821 /*
13822  * Get the snapshot vnode.
13823  *
13824  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13825  * needs nameidone() on ndp.
13826  *
13827  * If the snapshot vnode exists it is returned in ndp->ni_vp.
13828  *
13829  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13830  * not needed.
13831  */
/*
 * Get the snapshot vnode.
 *
 * Resolves dirfd to the volume root (*rvpp), verifies the filesystem
 * supports snapshots, obtains the snapshot directory (*sdvpp), validates
 * the user-supplied snapshot name, and looks it up via namei().
 *
 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
 * needs nameidone() on ndp.  If the snapshot vnode exists it is returned
 * in ndp->ni_vp.  On error, *rvpp and *sdvpp are NULL and nameidone() is
 * not needed.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-clear outputs so the error path can unconditionally test them. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot ops are only meaningful on a volume root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if one is found before the end, reject the name. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any failure, drop both iocounts and NULL the outputs. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
13934 
13935 /*
13936  * create a filesystem snapshot (for supporting filesystems)
13937  *
13938  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13939  * We get to the (unnamed) snapshot directory vnode and create the vnode
13940  * for the snapshot in it.
13941  *
13942  * Restrictions:
13943  *
13944  *    a) Passed in name for snapshot cannot have slashes.
13945  *    b) name can't be "." or ".."
13946  *
13947  * Since this requires superuser privileges, vnode_authorize calls are not
13948  * made.
13949  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is large; heap-allocate it rather than use stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Resolve dirfd to the volume root, fetch the snapshot directory,
	 * and look up the requested snapshot name in it.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* A snapshot with this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path (see block comment above): skip authorization. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13996 
13997 /*
13998  * Delete a Filesystem snapshot
13999  *
14000  * get the vnode for the unnamed snapshot directory and the snapshot and
14001  * delete the snapshot.
14002  */
14003 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14004 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
14005     vfs_context_t ctx)
14006 {
14007 	vnode_t rvp, snapdvp;
14008 	int error;
14009 	struct nameidata *ndp;
14010 
14011 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
14012 
14013 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
14014 	    OP_UNLINK, ctx);
14015 	if (error) {
14016 		goto out;
14017 	}
14018 
14019 	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
14020 	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
14021 
14022 	vnode_put(ndp->ni_vp);
14023 	nameidone(ndp);
14024 	vnode_put(snapdvp);
14025 	vnode_put(rvp);
14026 out:
14027 	kfree_type(struct nameidata, ndp);
14028 
14029 	return error;
14030 }
14031 
14032 /*
14033  * Revert a filesystem to a snapshot
14034  *
14035  * Marks the filesystem to revert to the given snapshot on next mount.
14036  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy in the snapshot name (name_len includes the terminating NUL). */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a synthetic componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Fallback: look the snapshot up and issue the revert on its vnode. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14120 
14121 /*
14122  * rename a Filesystem snapshot
14123  *
14124  * get the vnode for the unnamed snapshot directory and the snapshot and
14125  * rename the snapshot. This is a very specialised (and simple) case of
14126  * rename(2) (which has to deal with a lot more complications). It differs
14127  * slightly from rename(2) in that EEXIST is returned if the new name exists.
14128  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the existing snapshot by name. DELETE/OP_UNLINK sets up
	 * fromnd's componentname the way VNOP_RENAME expects for the "from"
	 * side, and on success we hold iocounts on rvp (the mount's root),
	 * snapdvp (the snapshot directory) and fromnd->ni_vp (the snapshot).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; if the loop exits early, a slash was present. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Rename creates a new snapshot name, so use the create check. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name relative to the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Source and destination share the same parent: the snapshot dir. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14223 
14224 /*
14225  * Mount a Filesystem snapshot
14226  *
14227  * get the vnode for the unnamed snapshot directory and the snapshot and
14228  * mount the snapshot.
14229  */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Look up the snapshot; on success we hold iocounts on rvp (mount
	 * root), snapdvp (snapshot directory) and snapndp->ni_vp (snapshot).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* Bail out if the underlying mount went away or was force-unmounted. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Hand the snapshot's mount and componentname to the filesystem via
	 * mount_common(); KERNEL_MOUNT_SNAPSHOT tells it smnt_data is a
	 * kernel-space fs_snapshot_mount_args rather than user mount data.
	 */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14305 
14306 /*
14307  * Root from a snapshot of the filesystem
14308  *
14309  * Marks the filesystem to root from the given snapshot on next boot.
14310  */
14311 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14312 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14313     vfs_context_t ctx)
14314 {
14315 	int error;
14316 	vnode_t rvp;
14317 	mount_t mp;
14318 	struct fs_snapshot_root_args root_data;
14319 	struct componentname cnp;
14320 	caddr_t name_buf;
14321 	size_t name_len;
14322 
14323 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14324 	if (error) {
14325 		return error;
14326 	}
14327 	mp = vnode_mount(rvp);
14328 
14329 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14330 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14331 	if (error) {
14332 		zfree(ZV_NAMEI, name_buf);
14333 		vnode_put(rvp);
14334 		return error;
14335 	}
14336 
14337 	// XXX MAC checks ?
14338 
14339 	/*
14340 	 * Grab mount_iterref so that we can release the vnode,
14341 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14342 	 */
14343 	error = mount_iterref(mp, 0);
14344 	vnode_put(rvp);
14345 	if (error) {
14346 		zfree(ZV_NAMEI, name_buf);
14347 		return error;
14348 	}
14349 
14350 	memset(&cnp, 0, sizeof(cnp));
14351 	cnp.cn_pnbuf = (char *)name_buf;
14352 	cnp.cn_nameiop = LOOKUP;
14353 	cnp.cn_flags = ISLASTCN | HASBUF;
14354 	cnp.cn_pnlen = MAXPATHLEN;
14355 	cnp.cn_nameptr = cnp.cn_pnbuf;
14356 	cnp.cn_namelen = (int)name_len;
14357 	root_data.sr_cnp = &cnp;
14358 
14359 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14360 
14361 	mount_iterdrop(mp);
14362 	zfree(ZV_NAMEI, name_buf);
14363 
14364 	return error;
14365 }
14366 
14367 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14368 vfs_context_can_snapshot(vfs_context_t ctx)
14369 {
14370 	static const char * const snapshot_entitlements[] = {
14371 		"com.apple.private.vfs.snapshot",
14372 		"com.apple.developer.vfs.snapshot",
14373 		"com.apple.private.apfs.arv.limited.snapshot",
14374 	};
14375 	static const size_t nentitlements =
14376 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14377 	size_t i;
14378 
14379 	task_t task = vfs_context_task(ctx);
14380 	for (i = 0; i < nentitlements; i++) {
14381 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14382 			return TRUE;
14383 		}
14384 	}
14385 	return FALSE;
14386 }
14387 
14388 /*
14389  * FS snapshot operations dispatcher
14390  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permit the op if the caller is superuser, can write the
		 * backing device, or holds the user snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14479