xref: /xnu-8796.141.3/bsd/vfs/vfs_syscalls.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117 
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122 
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125 
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130 
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137 
138 #include <nfs/nfs_conf.h>
139 
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143 
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148 
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 	((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 	release_pathbuff(x)
154 #else
155 #define GET_PATH(x)     \
156 	((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 	zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160 
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164 
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168 
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
171 #endif
172 
173 extern void disk_conditioner_unmount(mount_t mp);
174 
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 	vnode_t olddp;
178 	vnode_t newdp;
179 };
180 /* callback  for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182 
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192     boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195     struct componentname *cnp, user_addr_t fsmountargs,
196     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198 
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200 
201 struct fd_vn_data * fg_vn_data_alloc(void);
202 
203 /*
204  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205  * Concurrent lookups (or lookups by ids) on hard links can cause the
206  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207  * does) to return ENOENT as the path cannot be returned from the name cache
208  * alone. We have no option but to retry and hope to get one namei->reverse path
209  * generation done without an intervening lookup, lookup by id on the hard link
210  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211  * which currently are the MAC hooks for rename, unlink and rmdir.
212  */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214 
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217 
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219     int unlink_flags);
220 
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229 
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236 
237 __private_extern__
238 int sync_internal(void);
239 
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242 
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245 
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249 
250 extern lck_rw_t rootvnode_rw_lock;
251 
252 VFS_SMR_DECLARE;
253 extern uint32_t nc_smr_enabled;
254 
255 /*
256  * incremented each time a mount or unmount operation occurs
257  * used to invalidate the cached value of the rootvp in the
258  * mount structure utilized by cache_lookup_path
259  */
260 uint32_t mount_generation = 0;
261 
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264 
265 /* system-wide, per-boot unique mount ID */
266 static _Atomic uint64_t mount_unique_id = 1;
267 
268 extern const struct fileops vnops;
269 #if CONFIG_APPLEDOUBLE
270 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
271 #endif /* CONFIG_APPLEDOUBLE */
272 
273 /* Maximum buffer length supported by fsgetpath(2) */
274 #define FSGETPATH_MAXBUFLEN  8192
275 
276 /*
277  * Virtual File System System Calls
278  */
279 
280 /*
281  * Private in-kernel mounting spi (specific use-cases only)
282  */
283 boolean_t
vfs_iskernelmount(mount_t mp)284 vfs_iskernelmount(mount_t mp)
285 {
286 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
287 }
288 
/*
 * In-kernel mount entry point: resolve (or accept) the vnode to be
 * covered and hand the request to mount_common() with
 * KERNEL_MOUNT_KMOUNT set.
 *
 * If `vp` is NULLVP, `path` is resolved via namei() and both the
 * covered vnode and its parent are obtained here (and released on the
 * way out).  Otherwise the caller supplies iocounted `vp`/`pvp`, and
 * `path` is only used to seed the componentname buffer.
 * Returns 0 on success or an errno value.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Drop any caller-supplied kernel flags outside the sanitize mask. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Only log for snapshot/role-based mounts (diagnostic aid). */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		char *pnbuf = CAST_DOWN(char *, path);

		/* Caller provided the vnodes; just seed the pathname buffer. */
		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as a kernel-initiated mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	if (did_namei) {
		/* Release the iocounts and name buffer taken by namei() above. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
338 
339 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)340 vfs_mount_at_path(const char *fstype, const char *path,
341     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
342     int mnt_flags, int flags)
343 {
344 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
345 	int error, km_flags = 0;
346 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
347 
348 	/*
349 	 * This call is currently restricted to specific use cases.
350 	 */
351 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
352 		return ENOTSUP;
353 	}
354 
355 #if !defined(XNU_TARGET_OS_OSX)
356 	if (strcmp(fstype, "lifs") == 0) {
357 		syscall_flags |= MNT_NOEXEC;
358 	}
359 #endif
360 
361 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
362 		km_flags |= KERNEL_MOUNT_NOAUTH;
363 	}
364 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
365 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
366 	}
367 
368 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
369 	    syscall_flags, km_flags, ctx);
370 	if (error) {
371 		printf("%s: mount on %s failed, error %d\n", __func__, path,
372 		    error);
373 	}
374 
375 	return error;
376 }
377 
378 int
vfs_mount_override_type_name(mount_t mp,const char * name)379 vfs_mount_override_type_name(mount_t mp, const char *name)
380 {
381 	if (mp == NULL || name == NULL) {
382 		return EINVAL;
383 	}
384 
385 	/* Override the FS type name. */
386 	mount_lock_spin(mp);
387 	strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
388 	mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
389 	mount_unlock(mp);
390 
391 	return 0;
392 }
393 
394 /*
395  * Mount a file system.
396  */
397 /* ARGSUSED */
398 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)399 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
400 {
401 	struct __mac_mount_args muap;
402 
403 	muap.type = uap->type;
404 	muap.path = uap->path;
405 	muap.flags = uap->flags;
406 	muap.data = uap->data;
407 	muap.mac_p = USER_ADDR_NULL;
408 	return __mac_mount(p, &muap, retval);
409 }
410 
/*
 * fmount(2): mount a filesystem on the directory referenced by an open
 * file descriptor rather than a path.  Rejects image-source, rootfs
 * and union mounts, resolves the fd to a vnode plus its parent, builds
 * a componentname from the vnode's path, and delegates the rest to
 * mount_common() with KERNEL_MOUNT_FMOUNT.
 * Returns 0 on success or an errno value.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname    cn;
	vfs_context_t           ctx = vfs_context_current();
	size_t                  dummy = 0;
	int                     error;
	int                     flags = uap->flags;
	char                    fstypename[MFSNAMELEN];
	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t                 pvp;
	vnode_t                 vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	/* Copy the filesystem type name in from user space. */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the descriptor to its vnode (takes a file reference). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the vnode; drop the fd reference on failure. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		/*
		 * No parent: either something is already mounted here /
		 * it is a filesystem root (EBUSY), or the vnode is simply
		 * not mountable-on (EINVAL).
		 */
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname carrying the mount-on path for mount_common. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release, in order: path buffer, parent iocount, vnode iocount, fd ref. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
484 
485 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
486 
487 /*
488  * Get the size of a graft file (a manifest or payload file).
489  * The vp should be an iocounted vnode.
490  */
491 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)492 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
493 {
494 	struct stat64 sb = {};
495 	int error;
496 
497 	*size = 0;
498 
499 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
500 	if (error) {
501 		return error;
502 	}
503 
504 	if (sb.st_size == 0) {
505 		error = ENODATA;
506 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
507 		error = EFBIG;
508 	} else {
509 		*size = (size_t) sb.st_size;
510 	}
511 
512 	return error;
513 }
514 
515 /*
516  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
517  * `size` must already be validated.
518  */
519 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)520 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
521 {
522 	return vn_rdwr(UIO_READ, graft_vp,
523 	           (caddr_t) buf, (int) size, /* offset */ 0,
524 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
525 	           vfs_context_ucred(vctx), /* resid */ NULL,
526 	           vfs_context_proc(vctx));
527 }
528 
529 /*
530  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
531  * and read it into `buf`.
532  */
533 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)534 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
535 {
536 	vnode_t metadata_vp = NULLVP;
537 	int error;
538 
539 	// Convert this graft fd to a vnode.
540 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
541 		goto out;
542 	}
543 
544 	// Get (and validate) size information.
545 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
546 		goto out;
547 	}
548 
549 	// Read each file into the provided buffer - we must get the expected amount of bytes.
550 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
551 		goto out;
552 	}
553 
554 out:
555 	if (metadata_vp) {
556 		vnode_put(metadata_vp);
557 		metadata_vp = NULLVP;
558 	}
559 
560 	return error;
561 }
562 
563 /*
564  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
565  * provided in `gfs`, saving the size of data read in `gfs`.
566  */
567 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)568 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
569     fsioc_graft_fs_t *gfs)
570 {
571 	int error;
572 
573 	// Read the authentic manifest.
574 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
575 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
576 		return error;
577 	}
578 
579 	// The user manifest is currently unused, but set its size.
580 	gfs->user_manifest_size = 0;
581 
582 	// Read the payload.
583 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
584 	    &gfs->payload_size, gfs->payload))) {
585 		return error;
586 	}
587 
588 	return 0;
589 }
590 
591 /*
592  * Call into the filesystem to verify and graft a cryptex.
593  */
/*
 * Call into the filesystem to verify and graft a cryptex.
 *
 * Pre-flights the secure-boot cryptex arguments, reads the manifest
 * and payload fds into bounded kernel buffers, translates SBC_* flags
 * into FSCTL_GRAFT_* flags, and issues FSIOC_GRAFT_FS on `cryptex_vp`.
 * `mounton_vp` may be NULLVP; when present it must be a directory on
 * the same volume as `cryptex_vp`.
 * Returns 0 on success or an errno value.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller's SBC_* flags into the fsctl graft flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Free whichever buffers were allocated (kfree_data on NULL is skipped
	// by the guards; pointers are cleared to avoid dangling references).
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
682 
683 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
684 
685 /*
686  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
687  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
688  */
/*
 * graftdmg(2): graft a cryptex disk image (via FD) onto the
 * appropriate mount-point.
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.  Copies in
 * the graft arguments, optionally resolves the mount-on directory, and
 * hands off to graft_secureboot_cryptex().
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Entitlement gate: privileged private SPI. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Validate the graft type before calling into the FS. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/*
	 * Release iocounts taken above; nameidone() is only owed when the
	 * mount-dir lookup actually ran (ua_mountdir was supplied).
	 */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
756 
757 /*
758  * Ungraft a cryptex disk image (via mount dir FD)
759  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
760  */
/*
 * ungraftdmg(2): ungraft a cryptex disk image (via mount dir path).
 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.  `flags`
 * must be 0 and `mountdir` must be supplied; the work is done by the
 * filesystem through FSIOC_UNGRAFT_FS.
 */
int
ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
{
	int error = 0;
	user_addr_t ua_mountdir = uap->mountdir;
	fsioc_ungraft_fs_t ugfs;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Entitlement gate: privileged private SPI. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* No flags are defined yet, and a mount dir is mandatory. */
	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
		return EINVAL;
	}

	ugfs.ungraft_flags = 0;

	// Acquire vnode for mount-on path
	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
	    UIO_USERSPACE, ua_mountdir, ctx);

	error = namei(&nd);
	if (error) {
		return error;
	}
	mounton_vp = nd.ni_vp;

	// Call into the FS to perform the ungraft
	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);

	/* Release the lookup's iocount and name buffer. */
	vnode_put(mounton_vp);
	nameidone(&nd);

	return error;
}
799 
800 
/*
 * Broadcast that a mount has occurred: raise a VQ_MOUNT vfs event for
 * watchers and post a NOTE_WRITE knote on `pdvp`, the parent directory
 * of the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
807 
808 /*
809  * __mac_mount:
810  *	Mount a file system taking into account MAC label behavior.
811  *	See mount(2) man page for more information
812  *
813  * Parameters:    p                        Process requesting the mount
814  *                uap                      User argument descriptor (see below)
815  *                retval                   (ignored)
816  *
817  * Indirect:      uap->type                Filesystem type
818  *                uap->path                Path to mount
819  *                uap->data                Mount arguments
820  *                uap->mac_p               MAC info
821  *                uap->flags               Mount flags
822  *
823  *
824  * Returns:        0                       Success
825  *                !0                       Not success
826  */
827 boolean_t root_fs_upgrade_try = FALSE;
828 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * NOTE(review): the final argument is trivially TRUE inside
		 * this branch (flags == MNT_IMGSRC_BY_INDEX was just tested);
		 * presumably kept for symmetry with other callers — confirm.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The user_mac layout differs between 32- and 64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Reject absurd label sizes before allocating. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out on this configuration. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special-case mounting over the root filesystem's root vnode. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() tolerates a NULL labelstr (labelsz is then 0). */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Release the lookup's iocounts and name buffer, if taken. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
989 
990 /*
991  * common mount implementation (final stage of mounting)
992  *
993  * Arguments:
 *  fstypename	file system type (ie its vfs name)
995  *  pvp		parent of covered vnode
996  *  vp		covered vnode
997  *  cnp		component name (ie path) of covered vnode
998  *  flags	generic mount flags
999  *  fsmountargs	file system specific data
1000  *  labelstr	optional MAC label
1001  *  kernelmount	TRUE for mounts initiated from inside the kernel
1002  *  ctx		caller's context
1003  */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;           /* device vnode from devpath lookup (iocount held) */
	struct vnode *device_vnode = NULLVP;    /* device vnode passed to VFS_MOUNT / vfs_init_io_attributes */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;                  /* true once 'flag' holds the pre-update mnt_flag snapshot */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                       /* nonzero once 'mp' was allocated here (fresh-mount path) */
	boolean_t vfsp_ref = FALSE;             /* holding a refcount on the vfstable entry */
	boolean_t is_rwlock_locked = FALSE;     /* mnt_rwlock held exclusive */
	boolean_t did_rele = FALSE;             /* devvp usecount already dropped on the error path */
	boolean_t have_usecount = FALSE;        /* holding a usecount on the covered vnode */
	boolean_t did_set_lmount = FALSE;       /* MNT_LMOUNT set on mp; must be cleared before return */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each iteration clears the lowest set bit */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* An update must target the root vnode of the mounted filesystem. */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Snapshot the current flags so a failed update can restore them. */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Find the registered filesystem type and take a reference on its vfstable entry. */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	/* Flush the covered vnode and claim it with VMOUNT (competing-mount check). */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* Mark the new mount "in progress" until fully set up or torn down. */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* Prefer the resolved path of the covered vnode; fall back to the
		 * caller-supplied pathname if vn_getpath_ext() fails. */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* Fresh-mount and MNT_UPDATE paths join here, both holding
	 * mnt_rwlock exclusive with MNT_LMOUNT set. */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* Clear, then re-apply, the caller-controllable flag set. */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			/* Fetch the (user) pointer to the device path from fsmountargs. */
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			/* Only block devices with a registered driver may back a local mount. */
			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				mode_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_mountedon(devvp))) {
				goto out3;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			/* Re-open the device read/write; the device_vnode is not
			 * handed to VFS_MOUNT on this path. */
			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem.  We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* fsmountargs carries the origin mount_t, not a user address, here. */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* Ordinary mount: hand off to the filesystem's mount entry point. */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 *
			 *   Need to do this before calling the MAC hook as it needs
			 *   information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* Publish the mount on the covered vnode. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			device_vnode->v_specflags |= SI_MOUNTEDON;
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	/* Failure after the FS mounted: force-unmount and undo the publication. */
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	/* Drop the usecount taken before opening the device (unless out4 did). */
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
	}
out2:
	/* Drop the iocount from the namei() device lookup. */
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1887 
1888 /*
1889  * Flush in-core data, check for competing mount attempts,
1890  * and set VMOUNT
1891  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);   /* bypass the ownership check */
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush in-core data for the soon-to-be-covered vnode. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can serve as mount points. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);
	/*
	 * fmount is stricter: it is busy if a mount is either in progress
	 * (VMOUNT set) or already present (v_mountedhere).  A regular mount
	 * only fails when both hold.
	 */
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Claim the vnode for this mount attempt. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC policy vetoed the mount: undo the VMOUNT claim. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1957 
1958 #if CONFIG_IMGSRC_ACCESS
1959 
1960 #define DEBUG_IMGSRC 0
1961 
1962 #if DEBUG_IMGSRC
1963 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1964 #else
1965 #define IMGSRC_DEBUG(args...) do { } while(0)
1966 #endif
1967 
1968 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1969 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1970 {
1971 	struct nameidata nd;
1972 	vnode_t vp, realdevvp;
1973 	mode_t accessmode;
1974 	int error;
1975 	enum uio_seg uio = UIO_USERSPACE;
1976 
1977 	if (ctx == vfs_context_kernel()) {
1978 		uio = UIO_SYSSPACE;
1979 	}
1980 
1981 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1982 	if ((error = namei(&nd))) {
1983 		IMGSRC_DEBUG("namei() failed with %d\n", error);
1984 		return error;
1985 	}
1986 
1987 	vp = nd.ni_vp;
1988 
1989 	if (!vnode_isblk(vp)) {
1990 		IMGSRC_DEBUG("Not block device.\n");
1991 		error = ENOTBLK;
1992 		goto out;
1993 	}
1994 
1995 	realdevvp = mp->mnt_devvp;
1996 	if (realdevvp == NULLVP) {
1997 		IMGSRC_DEBUG("No device backs the mount.\n");
1998 		error = ENXIO;
1999 		goto out;
2000 	}
2001 
2002 	error = vnode_getwithref(realdevvp);
2003 	if (error != 0) {
2004 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2005 		goto out;
2006 	}
2007 
2008 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2009 		IMGSRC_DEBUG("Wrong dev_t.\n");
2010 		error = ENXIO;
2011 		goto out1;
2012 	}
2013 
2014 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2015 
2016 	/*
2017 	 * If mount by non-root, then verify that user has necessary
2018 	 * permissions on the device.
2019 	 */
2020 	if (!vfs_context_issuser(ctx)) {
2021 		accessmode = KAUTH_VNODE_READ_DATA;
2022 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2023 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2024 		}
2025 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2026 			IMGSRC_DEBUG("Access denied.\n");
2027 			goto out1;
2028 		}
2029 	}
2030 
2031 	*devvpp = vp;
2032 
2033 out1:
2034 	vnode_put(realdevvp);
2035 
2036 out:
2037 	nameidone(&nd);
2038 
2039 	if (error) {
2040 		vnode_put(vp);
2041 	}
2042 
2043 	return error;
2044 }
2045 
2046 /*
2047  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2048  * and call checkdirs()
2049  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	/*
	 * NOTE(review): vnode_getname() returns a name reference that is
	 * normally released with vnode_putname(); in this debug-only path
	 * that does not appear to happen — confirm whether that matters
	 * when DEBUG_IMGSRC is enabled.
	 */
	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the in-progress marker (VMOUNT) for the mounted state. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Take a usecount on the covered vnode for the lifetime of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	/* Repoint any cwd/root references at the new mount's root. */
	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2096 
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode, clear its mounted-here state, and detach it from mp.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2108 
2109 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2110 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2111 {
2112 	int error;
2113 
2114 	/* unmount in progress return error */
2115 	mount_lock_spin(mp);
2116 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2117 		mount_unlock(mp);
2118 		return EBUSY;
2119 	}
2120 	mount_unlock(mp);
2121 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2122 
2123 	/*
2124 	 * We only allow the filesystem to be reloaded if it
2125 	 * is currently mounted read-only.
2126 	 */
2127 	if ((flags & MNT_RELOAD) &&
2128 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2129 		error = ENOTSUP;
2130 		goto out;
2131 	}
2132 
2133 	/*
2134 	 * Only root, or the user that did the original mount is
2135 	 * permitted to update it.
2136 	 */
2137 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2138 	    (!vfs_context_issuser(ctx))) {
2139 		error = EPERM;
2140 		goto out;
2141 	}
2142 #if CONFIG_MACF
2143 	error = mac_mount_check_remount(ctx, mp);
2144 	if (error != 0) {
2145 		goto out;
2146 	}
2147 #endif
2148 
2149 out:
2150 	if (error) {
2151 		lck_rw_done(&mp->mnt_rwlock);
2152 	}
2153 
2154 	return error;
2155 }
2156 
/*
 * Release the mount rwlock taken by a successful mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2162 
2163 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2164 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2165 {
2166 	vnode_t vp;
2167 
2168 	if (height >= MAX_IMAGEBOOT_NESTING) {
2169 		return EINVAL;
2170 	}
2171 
2172 	vp = imgsrc_rootvnodes[height];
2173 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2174 		*rvpp = vp;
2175 		return 0;
2176 	} else {
2177 		return ENOENT;
2178 	}
2179 }
2180 
/*
 * Relocate the imageboot source filesystem (one of the mounts recorded in
 * imgsrc_rootvnodes[]) so that it covers 'vp' instead, updating its
 * mount-on name and adding it to the mount list.  Each such mount can be
 * moved at most once (MNTK_HAS_MOVED).  Root-only.
 *
 * 'fsmountargs' is either a user{32,64}_mnt_imgsrc_args struct (by_index)
 * or, for binary compatibility, a bare device-path pointer.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined for this operation. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp on success. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount it returned. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Keep the old mount-on name so we can restore it on failure. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2401 
2402 #endif /* CONFIG_IMGSRC_ACCESS */
2403 
/*
 * Turn on disk quotas for 'mp' (HFS only) for each quota type whose
 * trigger file exists under the mount point.  Errors are deliberately
 * ignored so quota setup cannot fail the mount itself.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		/* Drop the iocount from namei(); only existence mattered. */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2437 
2438 
/*
 * Per-process callback for checkdirs(): if the process's cwd or root
 * directory is 'olddp' (the newly-covered vnode), repoint it at 'newdp'
 * (the root of the new mount).  Always returns PROC_RETURNED so iteration
 * continues; a failure to take refs simply leaves that process untouched.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		/* old_cvp takes olddp's usecount; new_cvp's spare ref is consumed */
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2518 
2519 
2520 
2521 /*
2522  * Scan all active processes to see if any of them have a current
2523  * or root directory onto which the new filesystem has just been
2524  * mounted. If so, replace them with the new mount point.
2525  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only our caller holds a usecount: nobody can have olddp as cwd/root. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the filesystem just mounted over olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If olddp was the system root, swap the global rootvnode as well. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2563 
2564 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2565 	"com.apple.private.vfs.role-account-unmount"
2566 
2567 /*
2568  * Unmount a file system.
2569  *
2570  * Note: unmount takes a path to the vnode mounted on as argument,
2571  * not special file (as before).
2572  */
2573 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Trade the vnode iocount for a mount ref before unmounting. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2620 
2621 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2622 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2623 {
2624 	mount_t mp;
2625 
2626 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2627 	if (mp == (mount_t)0) {
2628 		return ENOENT;
2629 	}
2630 	mount_ref(mp, 0);
2631 	mount_iterdrop(mp);
2632 	/* safedounmount consumes the mount ref */
2633 	return safedounmount(mp, flags, ctx);
2634 }
2635 
2636 /*
2637  * The mount struct comes with a mount ref which will be consumed.
2638  * Do the actual file system unmount, prevent some common foot shooting.
2639  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 *
	 * NOTE(review): MNT_LNOTRESP is an mnt_lflag bit but is tested
	 * against mnt_kern_flag here — verify the intended flag/field
	 * pairing before relying on this check.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() consumes the mount ref on all paths. */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Error paths must still consume the mount ref we were handed. */
	mount_drop(mp, 0);
	return error;
}
2703 
2704 /*
2705  * Do the actual file system unmount.
2706  */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* Publish the unmount-in-progress state under the mount lock. */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: back out the in-progress state. */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* vflush failed on a non-forced unmount: back out and bail. */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock across mount_list_remove(), then retake it. */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Mount lock is held here on all paths (error paths retook it). */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			/* May destroy mp if this was the last crossref. */
			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2994 
2995 /*
2996  * Unmount any mounts in this filesystem.
2997  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: we hold mount_list_lock, so we must not block here. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* smp sits atop a mount we will unmount: record it. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	/* kfree_data(NULL, ...) is a no-op, so the alloc-failure path is safe. */
	kfree_data(fsids, fsids_sz);
}
3055 
/*
 * Drop one cross reference on 'mp' taken while crossing from the covered
 * vnode 'dp' into the mount.  If this was the last crossref and 'dp' is no
 * longer covered by 'mp' (the unmount already disassociated them), this
 * caller is the last party interested in the mount and the mount structure
 * is destroyed here.
 *
 * 'need_put' requests that dp's iocount be released (while the vnode lock
 * is still held) before returning.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* Last crossref on a detached mount: tear the mount down. */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Let concurrent SMR readers drain before freeing. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3089 
3090 
/*
 * Sync each mounted filesystem.
 */
#if DIAGNOSTIC
/* Debug builds: when non-zero, sync paths also dump buffer statistics. */
int syncprt = 0;
#endif

/* When non-zero, sync paths print dirty-page counts via vm_countdirtypages(). */
int print_vmpage_stat = 0;
3099 
3100 /*
3101  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3102  *			mounted read-write with the passed waitfor value.
3103  *
3104  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3105  *		arg	user argument (please see below)
3106  *
3107  * User argument is a pointer to 32 bit unsigned integer which describes the
3108  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3109  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3110  * waitfor value.
3111  *
3112  * Returns:		VFS_RETURNED
3113  */
3114 static int
sync_callback(mount_t mp,void * arg)3115 sync_callback(mount_t mp, void *arg)
3116 {
3117 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3118 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3119 		unsigned waitfor = MNT_NOWAIT;
3120 
3121 		if (arg) {
3122 			waitfor = *(uint32_t*)arg;
3123 		}
3124 
3125 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3126 		if (waitfor != MNT_WAIT &&
3127 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3128 		    waitfor != MNT_NOWAIT &&
3129 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3130 		    waitfor != MNT_DWAIT &&
3131 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3132 			panic("Passed inappropriate waitfor %u to "
3133 			    "sync_callback()", waitfor);
3134 		}
3135 
3136 		mp->mnt_flag &= ~MNT_ASYNC;
3137 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3138 		if (asyncflag) {
3139 			mp->mnt_flag |= MNT_ASYNC;
3140 		}
3141 	}
3142 
3143 	return VFS_RETURNED;
3144 }
3145 
3146 /* ARGSUSED */
3147 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3148 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3149 {
3150 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3151 
3152 	if (print_vmpage_stat) {
3153 		vm_countdirtypages();
3154 	}
3155 
3156 #if DIAGNOSTIC
3157 	if (syncprt) {
3158 		vfs_bufstats();
3159 	}
3160 #endif /* DIAGNOSTIC */
3161 	return 0;
3162 }
3163 
/* Selects which class of backing media sync_internal_callback() targets. */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering: sync every volume */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual and/or non-local devices */
} sync_type_t;
3169 
3170 static int
sync_internal_callback(mount_t mp,void * arg)3171 sync_internal_callback(mount_t mp, void *arg)
3172 {
3173 	if (arg) {
3174 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3175 		    (mp->mnt_flag & MNT_LOCAL);
3176 		sync_type_t sync_type = *((sync_type_t *)arg);
3177 
3178 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3179 			return VFS_RETURNED;
3180 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3181 			return VFS_RETURNED;
3182 		}
3183 	}
3184 
3185 	(void)sync_callback(mp, NULL);
3186 
3187 	return VFS_RETURNED;
3188 }
3189 
/* Worker-thread state bits; protected by sync_mtx_lck. */
int sync_thread_state = 0;
/* Maximum seconds sync_internal() waits for the sync worker thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001    /* work is pending for the sync thread */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync worker thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
/* Identity of the running sync worker, for physical-write accounting. */
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3199 
/*
 * Worker thread that performs filesystem syncs on behalf of sync_internal().
 * Loops as long as new SYNC_THREAD_RUN requests arrive, syncing reliable
 * (local, non-virtual) media first and unreliable media second, then wakes
 * any waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request before dropping the lock to do the work. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3243 
/* Last time the "sync timed out" message was printed (2-minute rate limit). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3245 
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Posts work for (and, if necessary, creates) the sync worker thread, then
 * waits up to sync_timeout_seconds for it to finish.  Always returns 0:
 * a timeout is only logged (rate limited), never reported to the caller.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* No worker exists: mark one running and start it. */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP: the mutex is released when msleep returns. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Timed out (or interrupted): log at most every 2 minutes. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		/* Drop the reference kernel_thread_start() returned. */
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3292 
/*
 * Change filesystem quotas.
 *
 * quotactl(2): resolve uap->path to a mount, then dispatch the quota
 * sub-command (encoded in the high bits of uap->cmd) to the filesystem
 * via VFS_QUOTACTL(), marshalling the per-command argument in and out of
 * user space.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Only the mount is needed from here on: take a ref, drop the vnode. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				/* Convert the 64-bit user layout into the kernel dqblk. */
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results back out, and free the path buffer for Q_QUOTAON. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3403 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out: report the operation as unsupported. */
	return EOPNOTSUPP;
}
3409 #endif /* QUOTA */
3410 
3411 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3412 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3413 {
3414 	int error;
3415 	vfs_context_t ctx = vfs_context_current();
3416 
3417 #if CONFIG_MACF
3418 	error = mac_mount_check_stat(ctx, mp);
3419 	if (error != 0) {
3420 		return error;
3421 	}
3422 #endif
3423 
3424 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3425 	if (error != 0) {
3426 		return error;
3427 	}
3428 
3429 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3430 }
3431 
3432 /*
3433  * Get filesystem statistics.
3434  *
3435  * Returns:	0			Success
3436  *	namei:???
3437  *	vfs_update_vfsstat:???
3438  *	munge_statfs:EFAULT
3439  */
3440 /* ARGSUSED */
3441 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3442 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3443 {
3444 	int error;
3445 	struct mount *mp;
3446 	struct nameidata nd;
3447 	vfs_context_t ctx = vfs_context_current();
3448 	vnode_t vp;
3449 
3450 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3451 	    UIO_USERSPACE, uap->path, ctx);
3452 	error = namei(&nd);
3453 	if (error != 0) {
3454 		return error;
3455 	}
3456 	vp = nd.ni_vp;
3457 	mp = vp->v_mount;
3458 	nameidone(&nd);
3459 
3460 	error = statfs_internal(p, mp, uap->buf);
3461 	vnode_put(vp);
3462 
3463 	return error;
3464 }
3465 
3466 /*
3467  * Get filesystem statistics.
3468  */
3469 /* ARGSUSED */
3470 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3471 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3472 {
3473 	int error;
3474 	vnode_t vp = NULL;
3475 	struct mount *mp;
3476 
3477 	AUDIT_ARG(fd, uap->fd);
3478 
3479 	if ((error = file_vnode(uap->fd, &vp)) ||
3480 	    (error = vnode_getwithref(vp))) {
3481 		goto out;
3482 	}
3483 
3484 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3485 
3486 	mp = vp->v_mount;
3487 	if (!mp) {
3488 		error = EBADF;
3489 		goto out_vnode;
3490 	}
3491 
3492 	error = statfs_internal(p, mp, uap->buf);
3493 
3494 out_vnode:
3495 	vnode_put(vp);
3496 
3497 out:
3498 	if (vp != NULL) {
3499 		file_drop(uap->fd);
3500 	}
3501 
3502 	return error;
3503 }
3504 
3505 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3506 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3507 {
3508 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3509 
3510 	bzero(sfs, sizeof(*sfs));
3511 
3512 	sfs->f_bsize = vsfs->f_bsize;
3513 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3514 	sfs->f_blocks = vsfs->f_blocks;
3515 	sfs->f_bfree = vsfs->f_bfree;
3516 	sfs->f_bavail = vsfs->f_bavail;
3517 	sfs->f_files = vsfs->f_files;
3518 	sfs->f_ffree = vsfs->f_ffree;
3519 	sfs->f_fsid = vsfs->f_fsid;
3520 	sfs->f_owner = vsfs->f_owner;
3521 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3522 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3523 	sfs->f_fssubtype = vsfs->f_fssubtype;
3524 	sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3525 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3526 		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3527 	} else {
3528 		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3529 	}
3530 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3531 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3532 }
3533 
3534 /*
3535  * Get file system statistics in 64-bit mode
3536  */
3537 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3538 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3539 {
3540 	struct mount *mp;
3541 	int error;
3542 	struct nameidata *ndp;
3543 	struct statfs64 *sfsp;
3544 	vfs_context_t ctxp = vfs_context_current();
3545 	vnode_t vp;
3546 	struct {
3547 		struct nameidata nd;
3548 		struct statfs64 sfs;
3549 	} *__nameidata_statfs64;
3550 
3551 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3552 	    Z_WAITOK);
3553 	ndp = &__nameidata_statfs64->nd;
3554 
3555 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3556 	    UIO_USERSPACE, uap->path, ctxp);
3557 	error = namei(ndp);
3558 	if (error != 0) {
3559 		goto out;
3560 	}
3561 	vp = ndp->ni_vp;
3562 	mp = vp->v_mount;
3563 	nameidone(ndp);
3564 
3565 #if CONFIG_MACF
3566 	error = mac_mount_check_stat(ctxp, mp);
3567 	if (error != 0) {
3568 		vnode_put(vp);
3569 		goto out;
3570 	}
3571 #endif
3572 
3573 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3574 	if (error != 0) {
3575 		vnode_put(vp);
3576 		goto out;
3577 	}
3578 
3579 	sfsp = &__nameidata_statfs64->sfs;
3580 	vfs_get_statfs64(mp, sfsp);
3581 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3582 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3583 		/* This process does not want to see a seperate data volume mountpoint */
3584 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3585 	}
3586 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3587 	vnode_put(vp);
3588 
3589 out:
3590 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3591 
3592 	return error;
3593 }
3594 
3595 /*
3596  * Get file system statistics in 64-bit mode
3597  */
3598 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3599 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3600 {
3601 	struct vnode *vp;
3602 	struct mount *mp;
3603 	struct statfs64 sfs;
3604 	int error;
3605 
3606 	AUDIT_ARG(fd, uap->fd);
3607 
3608 	if ((error = file_vnode(uap->fd, &vp))) {
3609 		return error;
3610 	}
3611 
3612 	error = vnode_getwithref(vp);
3613 	if (error) {
3614 		file_drop(uap->fd);
3615 		return error;
3616 	}
3617 
3618 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3619 
3620 	mp = vp->v_mount;
3621 	if (!mp) {
3622 		error = EBADF;
3623 		goto out;
3624 	}
3625 
3626 #if CONFIG_MACF
3627 	error = mac_mount_check_stat(vfs_context_current(), mp);
3628 	if (error != 0) {
3629 		goto out;
3630 	}
3631 #endif
3632 
3633 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3634 		goto out;
3635 	}
3636 
3637 	vfs_get_statfs64(mp, &sfs);
3638 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3639 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3640 		/* This process does not want to see a seperate data volume mountpoint */
3641 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3642 	}
3643 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3644 
3645 out:
3646 	file_drop(uap->fd);
3647 	vnode_put(vp);
3648 
3649 	return error;
3650 }
3651 
/* Context shared between the getfsstat family and its vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t     sfsp;       /* user-buffer cursor for statfs output */
	user_addr_t     *mp;        /* optional array of user MAC-label pointers */
	int             count;      /* mounts seen so far (may exceed maxcount) */
	int             maxcount;   /* capacity of the user buffer, in entries */
	int             flags;      /* caller's MNT_WAIT/MNT_NOWAIT/... flags */
	int             error;      /* first error encountered by the callback */
};
3660 
3661 
/*
 * vfs_iterate callback for getfsstat(2): copy one mount's statfs (and,
 * optionally, its MAC label) out to user space.  Mounts are still counted
 * once the user buffer is full, so the caller can report the total.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip, keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor by the size munge_statfs wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3715 
3716 /*
3717  * Get statistics on all filesystems.
3718  */
3719 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3720 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3721 {
3722 	struct __mac_getfsstat_args muap;
3723 
3724 	muap.buf = uap->buf;
3725 	muap.bufsize = uap->bufsize;
3726 	muap.mac = USER_ADDR_NULL;
3727 	muap.macsize = 0;
3728 	muap.flags = uap->flags;
3729 
3730 	return __mac_getfsstat(p, &muap, retval);
3731 }
3732 
3733 /*
3734  * __mac_getfsstat: Get MAC-related file system statistics
3735  *
3736  * Parameters:    p                        (ignored)
3737  *                uap                      User argument descriptor (see below)
3738  *                retval                   Count of file system statistics (N stats)
3739  *
3740  * Indirect:      uap->bufsize             Buffer size
3741  *                uap->macsize             MAC info size
3742  *                uap->buf                 Buffer where information will be returned
3743  *                uap->mac                 MAC info
3744  *                uap->flags               File system flags
3745  *
3746  *
3747  * Returns:        0                       Success
3748  *                !0                       Not success
3749  *
3750  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would not fit the int-based accounting below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry size in the user buffer depends on the caller's ABI. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC array must have exactly one entry per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				/* Widen 32-bit user pointers to user_addr_t. */
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report at most maxcount entries as written; count is the total seen. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3844 
/*
 * vfs_iterate callback for getfsstat64(2): copy one mount's statfs64 out to
 * user space.  Mounts are still counted after the user buffer fills, so the
 * caller can report the total number available.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip, keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3889 
3890 /*
3891  * Get statistics on all file systems in 64 bit mode.
3892  */
3893 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3894 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3895 {
3896 	user_addr_t sfsp;
3897 	int count, maxcount;
3898 	struct getfsstat_struct fst;
3899 
3900 	maxcount = uap->bufsize / sizeof(struct statfs64);
3901 
3902 	sfsp = uap->buf;
3903 	count = 0;
3904 
3905 	fst.sfsp = sfsp;
3906 	fst.flags = uap->flags;
3907 	fst.count = 0;
3908 	fst.error = 0;
3909 	fst.maxcount = maxcount;
3910 
3911 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3912 
3913 	if (fst.error) {
3914 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3915 		return fst.error;
3916 	}
3917 
3918 	if (fst.sfsp && fst.count > fst.maxcount) {
3919 		*retval = fst.maxcount;
3920 	} else {
3921 		*retval = fst.count;
3922 	}
3923 
3924 	return 0;
3925 }
3926 
3927 /*
3928  * gets the associated vnode with the file descriptor passed.
3929  * as input
3930  *
3931  * INPUT
3932  * ctx - vfs context of caller
3933  * fd - file descriptor for which vnode is required.
3934  * vpp - Pointer to pointer to vnode to be returned.
3935  *
3936  * The vnode is returned with an iocount so any vnode obtained
3937  * by this call needs a vnode_put
3938  *
3939  */
3940 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3941 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3942 {
3943 	int error;
3944 	vnode_t vp;
3945 	struct fileproc *fp;
3946 	proc_t p = vfs_context_proc(ctx);
3947 
3948 	*vpp =  NULLVP;
3949 
3950 	error = fp_getfvp(p, fd, &fp, &vp);
3951 	if (error) {
3952 		return error;
3953 	}
3954 
3955 	error = vnode_getwithref(vp);
3956 	if (error) {
3957 		(void)fp_drop(p, fd, fp, 0);
3958 		return error;
3959 	}
3960 
3961 	(void)fp_drop(p, fd, fp, 0);
3962 	*vpp = vp;
3963 	return error;
3964 }
3965 
/*
 * Wrapper function around namei to start lookup from a directory
 * specified by a file descriptor ni_dirfd.
 *
 * In addition to all the errors returned by namei, this call can
 * return ENOTDIR if the file descriptor does not refer to a directory,
 * and EBADF if the file descriptor is not valid.
 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only honor dirfd for relative paths, and only when the caller has
	 * not already supplied a starting directory (USEDVP) or is resuming
	 * a continued lookup.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to test for absoluteness. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Run the lookup rooted at dirfd's vnode. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
4017 
/*
 * Change current working directory to a given file descriptor.
 *
 * Shared implementation for fchdir(2) and __pthread_fchdir().  When
 * 'per_thread' is set the new cwd is stored in the calling thread
 * (uu_cdir) rather than the process; fd == -1 in that mode clears the
 * per-thread cwd and reverts to the process cwd.
 */
/* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* If a filesystem is mounted on this directory, descend to its root. */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the short-term iocount for a long-lived usecount on the cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the reference on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4134 
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* fchdir(2): change the per-process working directory. */
	return common_fchdir(p, uap, 0);
}
4140 
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread variant of fchdir; fd == -1 reverts to the process cwd.
	 * The cast assumes the two uap structs are layout-compatible —
	 * NOTE(review): confirm against the syscall argument definitions.
	 */
	return common_fchdir(p, (void *)uap, 1);
}
4146 
4147 
4148 /*
4149  * Change current working directory (".").
4150  *
4151  * Returns:	0			Success
4152  *	change_dir:ENOTDIR
4153  *	change_dir:???
4154  *	vnode_ref:ENOENT		No such file or directory
4155  */
4156 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve the path and authorize search access; on success we hold an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert to a long-term usecount before the iocount is dropped. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Per-thread cwd: swap the vnode hanging off the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having at least one per-thread cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread context: undo the usecount taken above. */
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/*
		 * Process-wide cwd: hold both the dirs lock (synchronizes with
		 * lookup) and the fd lock while swapping fd_cdir.
		 */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held on the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4202 
4203 
4204 /*
4205  * Change current working directory (".").
4206  *
4207  * Returns:	0			Success
4208  *	chdir_internal:ENOTDIR
4209  *	chdir_internal:ENOENT		No such file or directory
4210  *	chdir_internal:???
4211  */
4212 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Set up a user-space path lookup; chdir_internal performs it via change_dir. */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
4224 
4225 
4226 /*
4227  * chdir
4228  *
4229  * Change current working directory (".") for the entire process
4230  *
4231  * Parameters:  p       Process requesting the call
4232  *              uap     User argument descriptor (see below)
4233  *              retval  (ignored)
4234  *
4235  * Indirect parameters:	uap->path	Directory path
4236  *
4237  * Returns:	0			Success
4238  *              common_chdir: ENOTDIR
4239  *              common_chdir: ENOENT	No such file or directory
4240  *              common_chdir: ???
4241  *
4242  */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir; the args structures are layout-compatible, hence the cast. */
	return common_chdir(p, (void *)uap, 0);
}
4248 
4249 /*
4250  * __pthread_chdir
4251  *
4252  * Change current working directory (".") for a single thread
4253  *
4254  * Parameters:  p       Process requesting the call
4255  *              uap     User argument descriptor (see below)
4256  *              retval  (ignored)
4257  *
4258  * Indirect parameters:	uap->path	Directory path
4259  *
4260  * Returns:	0			Success
4261  *              common_chdir: ENOTDIR
4262  *		common_chdir: ENOENT	No such file or directory
4263  *		common_chdir: ???
4264  *
4265  */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir; the args structures are layout-compatible, hence the cast. */
	return common_chdir(p, (void *)uap, 1);
}
4271 
4272 
4273 /*
4274  * Change notion of root (``/'') directory.
4275  */
4276 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot requires superuser privileges. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/* Resolve the target; on success change_dir leaves us with an iocount. */
	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Convert the iocount into a long-term usecount for fd_rdir. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4334 
4335 #define PATHSTATICBUFLEN 256
4336 #define PIVOT_ROOT_ENTITLEMENT              \
4337        "com.apple.private.vfs.pivot-root"
4338 
4339 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack path buffers; long paths fall back to ZV_NAMEI heap buffers. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; retry with a MAXPATHLEN heap buffer if too long. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the path where the old root will end up. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point incoming/outgoing at whichever buffer (stack or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Drop the iocount from vnode_lookup, then free any heap path buffers. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4431 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS targets; reject elsewhere. */
	return nosys(p, NULL, retval);
}
4437 #endif /* XNU_TARGET_OS_OSX */
4438 
4439 /*
4440  * Common routine for chroot and chdir.
4441  *
4442  * Returns:	0			Success
4443  *		ENOTDIR			Not a directory
4444  *		namei:???		[anything namei can return]
4445  *		vnode_authorize:???	[anything vnode_authorize can return]
4446  */
4447 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4448 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4449 {
4450 	vnode_t vp;
4451 	int error;
4452 
4453 	if ((error = namei(ndp))) {
4454 		return error;
4455 	}
4456 	nameidone(ndp);
4457 	vp = ndp->ni_vp;
4458 
4459 	if (vp->v_type != VDIR) {
4460 		vnode_put(vp);
4461 		return ENOTDIR;
4462 	}
4463 
4464 #if CONFIG_MACF
4465 	error = mac_vnode_check_chdir(ctx, vp);
4466 	if (error) {
4467 		vnode_put(vp);
4468 		return error;
4469 	}
4470 #endif
4471 
4472 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4473 	if (error) {
4474 		vnode_put(vp);
4475 		return error;
4476 	}
4477 
4478 	return error;
4479 }
4480 
/*
 * Allocate the per-fd vnode data (for directories) associated with the file glob.
 */
4484 struct fd_vn_data *
fg_vn_data_alloc(void)4485 fg_vn_data_alloc(void)
4486 {
4487 	struct fd_vn_data *fvdata;
4488 
4489 	/* Allocate per fd vnode data */
4490 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4491 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4492 	return fvdata;
4493 }
4494 
4495 /*
4496  * Free the vnode data (for directories) associated with the file glob.
4497  */
4498 void
fg_vn_data_free(void * fgvndata)4499 fg_vn_data_free(void *fgvndata)
4500 {
4501 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4502 
4503 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4504 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4505 	kfree_type(struct fd_vn_data, fvdata);
4506 }
4507 
4508 /*
4509  * Check permissions, allocate an open file structure,
4510  * and call the device open routine if any.
4511  *
4512  * Returns:	0			Success
4513  *		EINVAL
4514  *		EINTR
4515  *	falloc:ENFILE
4516  *	falloc:EMFILE
4517  *	falloc:ENOMEM
4518  *	vn_open_auth:???
4519  *	dupfdopen:???
4520  *	VNOP_ADVLOCK:???
4521  *	vnode_setsize:???
4522  *
4523  * XXX Need to implement uid, gid
4524  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel fflags; encryption bits are set by vn_open_auth. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc before attempting the open. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Take an iocount on the authorizing fd's vnode, if one was supplied. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/* ENODEV/ENXIO with uu_dupfd set means /dev/fd-style dup-open was requested. */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply flock(2)-style open locks (O_EXLOCK/O_SHLOCK) before returning the fd. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * Drop the iocount from vn_open_auth.  NOTE(review): vp is still
	 * dereferenced below; the fileglob's reference appears to keep it
	 * valid — confirm against vnode lifecycle rules.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor into the fd table and release our fileproc hold. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error unwind: undo any open lock, close the vnode, free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4825 
4826 /*
4827  * While most of the *at syscall handlers can call nameiat() which
4828  * is a wrapper around namei, the use of namei and initialisation
4829  * of nameidata are far removed and in different functions  - namei
4830  * gets called in vn_open_auth for open1. So we'll just do here what
4831  * nameiat() does.
4832  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/* Only relative paths need dirfd; USEDVP means a dvp was already supplied. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to see whether it is absolute. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: anchor the lookup at dirfd's vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			/* Drop the iocount taken by vnode_getfromfd. */
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path (or AT_FDCWD): plain open1. */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4876 
4877 /*
4878  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4879  *
4880  * Parameters:	p			Process requesting the open
4881  *		uap			User argument descriptor (see below)
4882  *		retval			Pointer to an area to receive the
 *					return value from the system call
4884  *
4885  * Indirect:	uap->path		Path to open (same as 'open')
4886  *		uap->flags		Flags to open (same as 'open'
4887  *		uap->uid		UID to set, if creating
4888  *		uap->gid		GID to set, if creating
4889  *		uap->mode		File mode, if creating (same as 'open')
4890  *		uap->xsecurity		ACL to set, if creating
4891  *
4892  * Returns:	0			Success
4893  *		!0			errno value
4894  *
4895  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4896  *
 * XXX:		We should enumerate the possible errno values here, and where
4898  *		in the code they originated.
4899  */
4900 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4901 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4902 {
4903 	int ciferror;
4904 	kauth_filesec_t xsecdst;
4905 	struct vnode_attr va;
4906 	struct nameidata nd;
4907 	int cmode;
4908 
4909 	AUDIT_ARG(owner, uap->uid, uap->gid);
4910 
4911 	xsecdst = NULL;
4912 	if ((uap->xsecurity != USER_ADDR_NULL) &&
4913 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4914 		return ciferror;
4915 	}
4916 
4917 	VATTR_INIT(&va);
4918 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4919 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4920 	if (uap->uid != KAUTH_UID_NONE) {
4921 		VATTR_SET(&va, va_uid, uap->uid);
4922 	}
4923 	if (uap->gid != KAUTH_GID_NONE) {
4924 		VATTR_SET(&va, va_gid, uap->gid);
4925 	}
4926 	if (xsecdst != NULL) {
4927 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4928 		va.va_vaflags |= VA_FILESEC_ACL;
4929 	}
4930 
4931 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4932 	    uap->path, vfs_context_current());
4933 
4934 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4935 	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4936 	if (xsecdst != NULL) {
4937 		kauth_filesec_free(xsecdst);
4938 	}
4939 
4940 	return ciferror;
4941 }
4942 
4943 /*
4944  * Go through the data-protected atomically controlled open (2)
4945  *
4946  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4947  */
4948 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4949 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4950     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4951 {
4952 	/*
4953 	 * Follow the same path as normal open(2)
4954 	 * Look up the item if it exists, and acquire the vnode.
4955 	 */
4956 	struct vnode_attr va;
4957 	struct nameidata nd;
4958 	int cmode;
4959 	int error;
4960 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4961 
4962 	VATTR_INIT(&va);
4963 	/* Mask off all but regular access permissions */
4964 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4965 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4966 
4967 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4968 	    path, ctx);
4969 
4970 	/*
4971 	 * Initialize the extra fields in vnode_attr to pass down our
4972 	 * extra fields.
4973 	 * 1. target cprotect class.
4974 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4975 	 */
4976 	if (flags & O_CREAT) {
4977 		/* lower level kernel code validates that the class is valid before applying it. */
4978 		if (class != PROTECTION_CLASS_DEFAULT) {
4979 			/*
4980 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4981 			 * file behave the same as open (2)
4982 			 */
4983 			VATTR_SET(&va, va_dataprotect_class, class);
4984 		}
4985 	}
4986 
4987 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4988 		if (flags & (O_RDWR | O_WRONLY)) {
4989 			/*
4990 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
4991 			 */
4992 			return EINVAL;
4993 		}
4994 		if (dpflags & O_DP_GETRAWENCRYPTED) {
4995 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4996 		}
4997 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4998 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4999 		}
5000 		if (dpflags & O_DP_AUTHENTICATE) {
5001 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5002 		}
5003 	}
5004 
5005 	error = open1at(vfs_context_current(), &nd, flags, &va,
5006 	    NULL, NULL, retval, fd, authfd);
5007 
5008 	return error;
5009 }
5010 
int
openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
{
	/* O_DP_AUTHENTICATE cannot be combined with file creation. */
	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
}
5021 
int
open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
{
	/* The non-at variant does not support O_DP_AUTHENTICATE at all. */
	if (uap->dpflags & O_DP_AUTHENTICATE) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
}
5032 
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/* va and nd are large; heap-allocate them together to keep the kernel stack shallow. */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5065 
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a cancellation point; mark it before doing the work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5072 
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Plain open: relative lookups are anchored at the cwd (AT_FDCWD). */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5080 
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat: relative lookups are anchored at uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5088 
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a cancellation point; mark it before doing the work. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5095 
5096 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5097 
5098 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5099 vfs_context_can_open_by_id(vfs_context_t ctx)
5100 {
5101 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5102 		return TRUE;
5103 	}
5104 
5105 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5106 	           OPEN_BY_ID_ENTITLEMENT);
5107 }
5108 
5109 /*
5110  * openbyid_np: open a file given a file system id and a file system object id
5111  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5112  *	file systems that don't support object ids it is a node id (uint64_t).
5113  *
5114  * Parameters:	p			Process requesting the open
5115  *		uap			User argument descriptor (see below)
5116  *		retval			Pointer to an area to receive the
 *					return value from the system call
5118  *
5119  * Indirect:	uap->path		Path to open (same as 'open')
5120  *
5121  *		uap->fsid		id of target file system
5122  *		uap->objid		id of target file system object
5123  *		uap->flags		Flags to open (same as 'open')
5124  *
5125  * Returns:	0			Success
5126  *		!0			errno value
5127  *
5128  *
 * XXX:		We should enumerate the possible errno values here, and where
5130  *		in the code they originated.
5131  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries and tasks holding the open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* Resolve a path from the fsid/objid, growing the buffer until it fits. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path before handing it to openat_internal. */
	buf[pathlen] = 0;

	/* The path is kernel-resident, hence UIO_SYSSPACE. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5188 
5189 
5190 /*
5191  * Create a special file.
5192  */
5193 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5194     int fd);
5195 
/*
 * Common backend for mknod() and mknodat(): create a character or block
 * special file at 'upath' (resolved relative to 'fd'; AT_FDCWD for cwd).
 * A FIFO request is delegated to mkfifo1().  'vap' carries the requested
 * mode and device number; 'mode' selects the node type via S_IFMT bits.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privilege. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are valid here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5298 
5299 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5300 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5301 {
5302 	struct vnode_attr va;
5303 
5304 	VATTR_INIT(&va);
5305 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5306 	VATTR_SET(&va, va_rdev, uap->dev);
5307 
5308 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5309 }
5310 
5311 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5312 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5313 {
5314 	struct vnode_attr va;
5315 
5316 	VATTR_INIT(&va);
5317 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5318 	VATTR_SET(&va, va_rdev, uap->dev);
5319 
5320 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5321 }
5322 
5323 /*
5324  * Create a named pipe.
5325  *
5326  * Returns:	0			Success
5327  *		EEXIST
5328  *	namei:???
5329  *	vnode_authorize:???
5330  *	vn_create:???
5331  */
/*
 * Common backend for mkfifo(), mkfifoat(), mkfifo_extended() and FIFO
 * requests routed here from mknodat_internal(): look up the parent
 * directory of 'upath' (relative to 'fd'), authorize, and create a VFIFO
 * node with the attributes in 'vap'.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5374 
5375 
5376 /*
5377  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5378  *
5379  * Parameters:	p			Process requesting the open
5380  *		uap			User argument descriptor (see below)
5381  *		retval			(Ignored)
5382  *
5383  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5384  *		uap->uid		UID to set
5385  *		uap->gid		GID to set
5386  *		uap->mode		File mode to set (same as 'mkfifo')
5387  *		uap->xsecurity		ACL to set, if creating
5388  *
5389  * Returns:	0			Success
5390  *		!0			errno value
5391  *
5392  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5393  *
 * XXX:		We should enumerate the possible errno values here, and where
5395  *		in the code they originated.
5396  */
5397 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5398 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5399 {
5400 	int ciferror;
5401 	kauth_filesec_t xsecdst;
5402 	struct vnode_attr va;
5403 
5404 	AUDIT_ARG(owner, uap->uid, uap->gid);
5405 
5406 	xsecdst = KAUTH_FILESEC_NONE;
5407 	if (uap->xsecurity != USER_ADDR_NULL) {
5408 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5409 			return ciferror;
5410 		}
5411 	}
5412 
5413 	VATTR_INIT(&va);
5414 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5415 	if (uap->uid != KAUTH_UID_NONE) {
5416 		VATTR_SET(&va, va_uid, uap->uid);
5417 	}
5418 	if (uap->gid != KAUTH_GID_NONE) {
5419 		VATTR_SET(&va, va_gid, uap->gid);
5420 	}
5421 	if (xsecdst != KAUTH_FILESEC_NONE) {
5422 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5423 		va.va_vaflags |= VA_FILESEC_ACL;
5424 	}
5425 
5426 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5427 
5428 	if (xsecdst != KAUTH_FILESEC_NONE) {
5429 		kauth_filesec_free(xsecdst);
5430 	}
5431 	return ciferror;
5432 }
5433 
5434 /* ARGSUSED */
5435 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5436 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5437 {
5438 	struct vnode_attr va;
5439 
5440 	VATTR_INIT(&va);
5441 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5442 
5443 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5444 }
5445 
5446 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5447 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5448 {
5449 	struct vnode_attr va;
5450 
5451 	VATTR_INIT(&va);
5452 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5453 
5454 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5455 }
5456 
5457 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5458 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5459 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5460 
/*
 * Best-effort path construction for 'dvp' (optionally with 'leafname'
 * appended) into 'path' of size '_len'.  Never fails outright: if the full
 * path cannot be produced, *truncated_path is set and the nearest ancestor
 * path (or mount point, or "/") is returned instead.  'firmlink' selects
 * whether firmlinks are followed during resolution.  Returns the length of
 * the string placed in 'path' including the terminating NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but filled the buffer; no room for a leaf. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain (falling back to the mount point,
		 * then "/") until some ancestor's path fits in the buffer.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5528 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following variant of the path builder. */
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	    follow_firmlinks);
}
5534 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Variant that resolves the path without following firmlinks. */
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	    follow_firmlinks);
}
5540 
5541 /*
5542  * Make a hard file link.
5543  *
5544  * Returns:	0			Success
5545  *		EPERM
5546  *		EEXIST
5547  *		EXDEV
5548  *	namei:???
5549  *	vnode_authorize:???
5550  *	VNOP_LINK:???
5551  */
5552 /* ARGSUSED */
/*
 * Common backend for link() and linkat(): look up the existing object at
 * 'path' (relative to fd1), then create a new directory entry 'link'
 * (relative to fd2) referring to it.  'flag' carries AT_SYMLINK_FOLLOW;
 * 'segflg' says which address space the path strings live in.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node; 'nd' is re-armed for a CREATE pass */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Path strings are only built if someone will consume them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			/* only drop the iocount we took above (pvp == dvp is borrowed) */
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5766 
5767 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5768 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5769 {
5770 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5771 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5772 }
5773 
5774 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5775 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5776 {
5777 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5778 		return EINVAL;
5779 	}
5780 
5781 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5782 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5783 }
5784 
5785 /*
5786  * Make a symbolic link.
5787  *
5788  * We could add support for ACLs here too...
5789  */
5790 /* ARGSUSED */
/*
 * Common backend for symlink() and symlinkat(): create a symbolic link at
 * 'link' (resolved relative to 'fd') whose contents are the string at
 * 'path_data'.  'segflg' says which address space both strings live in;
 * a user-space link string is copied into a ZV_NAMEI buffer first.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		/* Kernel-space caller: use the string in place, no copy. */
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* Link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Only free the buffer if we allocated one for the user-space copy. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5954 
5955 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5956 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5957 {
5958 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5959 	           uap->link, UIO_USERSPACE);
5960 }
5961 
5962 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5963 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5964     __unused int32_t *retval)
5965 {
5966 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5967 	           uap->path2, UIO_USERSPACE);
5968 }
5969 
5970 /*
5971  * Delete a whiteout from the filesystem.
5972  * No longer supported.
5973  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout deletion is no longer supported; always fail. */
	return ENOTSUP;
}
5979 
5980 /*
5981  * Delete a name from the filesystem.
5982  */
5983 /* ARGSUSED */
/*
 * Common backend for unlink(), unlinkat(), delete() and unlink1(): remove
 * the directory entry at 'path_arg' (resolved relative to 'fd', or to
 * 'start_dvp' when one is supplied).  'unlink_flags' carries the
 * VNODE_REMOVE_* modifiers.  Handles both classic and compound-VNOP
 * removal, with bounded redrives of the lookup on racing ENOENT.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Scratch state is heap-allocated rather than kept on the stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state is reset here; a redrive jumps back to 'retry'. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* ENOENT here may be a racing lookup; redrive a bounded number of times. */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp from lookup: only legal when the fs does compound remove. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6265 
/*
 * In-kernel unlink of 'path_arg'.  Thin wrapper that forwards to
 * unlinkat_internal() with relative lookups anchored at the current
 * working directory (AT_FDCWD); 'start_dvp' and 'unlink_flags' are
 * passed straight through.
 */
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	           unlink_flags);
}
6273 
6274 /*
6275  * Delete a name from the filesystem using Carbon semantics.
6276  */
6277 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6278 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6279 {
6280 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6281 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6282 }
6283 
6284 /*
6285  * Delete a name from the filesystem using POSIX semantics.
6286  */
6287 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6288 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6289 {
6290 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6291 	           uap->path, UIO_USERSPACE, 0);
6292 }
6293 
6294 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6295 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6296 {
6297 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6298 		return EINVAL;
6299 	}
6300 
6301 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6302 		int unlink_flags = 0;
6303 
6304 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6305 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6306 		}
6307 		return rmdirat_internal(vfs_context_current(), uap->fd,
6308 		           uap->path, UIO_USERSPACE, unlink_flags);
6309 	} else {
6310 		return unlinkat_internal(vfs_context_current(), uap->fd,
6311 		           NULLVP, uap->path, UIO_USERSPACE, 0);
6312 	}
6313 }
6314 
6315 /*
6316  * Reposition read/write file offset.
6317  */
6318 int
lseek(proc_t p,struct lseek_args * uap,off_t * retval)6319 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
6320 {
6321 	struct fileproc *fp;
6322 	vnode_t vp;
6323 	struct vfs_context *ctx;
6324 	off_t offset = uap->offset, file_size;
6325 	int error;
6326 
6327 	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
6328 		if (error == ENOTSUP) {
6329 			return ESPIPE;
6330 		}
6331 		return error;
6332 	}
6333 	if (vnode_isfifo(vp)) {
6334 		file_drop(uap->fd);
6335 		return ESPIPE;
6336 	}
6337 
6338 
6339 	ctx = vfs_context_current();
6340 #if CONFIG_MACF
6341 	if (uap->whence == L_INCR && uap->offset == 0) {
6342 		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
6343 		    fp->fp_glob);
6344 	} else {
6345 		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
6346 		    fp->fp_glob);
6347 	}
6348 	if (error) {
6349 		file_drop(uap->fd);
6350 		return error;
6351 	}
6352 #endif
6353 	if ((error = vnode_getwithref(vp))) {
6354 		file_drop(uap->fd);
6355 		return error;
6356 	}
6357 
6358 	switch (uap->whence) {
6359 	case L_INCR:
6360 		offset += fp->fp_glob->fg_offset;
6361 		break;
6362 	case L_XTND:
6363 		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
6364 			break;
6365 		}
6366 		offset += file_size;
6367 		break;
6368 	case L_SET:
6369 		break;
6370 	case SEEK_HOLE:
6371 		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
6372 		break;
6373 	case SEEK_DATA:
6374 		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
6375 		break;
6376 	default:
6377 		error = EINVAL;
6378 	}
6379 	if (error == 0) {
6380 		if (uap->offset > 0 && offset < 0) {
6381 			/* Incremented/relative move past max size */
6382 			error = EOVERFLOW;
6383 		} else {
6384 			/*
6385 			 * Allow negative offsets on character devices, per
6386 			 * POSIX 1003.1-2001.  Most likely for writing disk
6387 			 * labels.
6388 			 */
6389 			if (offset < 0 && vp->v_type != VCHR) {
6390 				/* Decremented/relative move before start */
6391 				error = EINVAL;
6392 			} else {
6393 				/* Success */
6394 				fp->fp_glob->fg_offset = offset;
6395 				*retval = fp->fp_glob->fg_offset;
6396 			}
6397 		}
6398 	}
6399 
6400 	/*
6401 	 * An lseek can affect whether data is "available to read."  Use
6402 	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
6403 	 */
6404 	post_event_if_success(vp, error, NOTE_NONE);
6405 	(void)vnode_put(vp);
6406 	file_drop(uap->fd);
6407 	return error;
6408 }
6409 
6410 
6411 /*
6412  * Check access permissions.
6413  *
6414  * Returns:	0			Success
6415  *		vnode_authorize:???
6416  */
6417 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6418 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6419 {
6420 	kauth_action_t action;
6421 	int error;
6422 
6423 	/*
6424 	 * If just the regular access bits, convert them to something
6425 	 * that vnode_authorize will understand.
6426 	 */
6427 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6428 		action = 0;
6429 		if (uflags & R_OK) {
6430 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6431 		}
6432 		if (uflags & W_OK) {
6433 			if (vnode_isdir(vp)) {
6434 				action |= KAUTH_VNODE_ADD_FILE |
6435 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6436 				/* might want delete rights here too */
6437 			} else {
6438 				action |= KAUTH_VNODE_WRITE_DATA;
6439 			}
6440 		}
6441 		if (uflags & X_OK) {
6442 			if (vnode_isdir(vp)) {
6443 				action |= KAUTH_VNODE_SEARCH;
6444 			} else {
6445 				action |= KAUTH_VNODE_EXECUTE;
6446 			}
6447 		}
6448 	} else {
6449 		/* take advantage of definition of uflags */
6450 		action = uflags >> 8;
6451 	}
6452 
6453 #if CONFIG_MACF
6454 	error = mac_vnode_check_access(ctx, vp, uflags);
6455 	if (error) {
6456 		return error;
6457 	}
6458 #endif /* MAC */
6459 
6460 	/* action == 0 means only check for existence */
6461 	if (action != 0) {
6462 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6463 	} else {
6464 		error = 0;
6465 	}
6466 
6467 	return error;
6468 }
6469 
6470 
6471 
6472 /*
6473  * access_extended: Check access permissions in bulk.
6474  *
6475  * Description:	uap->entries		Pointer to an array of accessx
6476  *                                      descriptor structs, plus one or
6477  *                                      more NULL terminated strings (see
6478  *                                      "Notes" section below).
6479  *		uap->size		Size of the area pointed to by
6480  *					uap->entries.
6481  *		uap->results		Pointer to the results array.
6482  *
6483  * Returns:	0			Success
6484  *		ENOMEM			Insufficient memory
6485  *		EINVAL			Invalid arguments
6486  *		namei:EFAULT		Bad address
6487  *		namei:ENAMETOOLONG	Filename too long
6488  *		namei:ENOENT		No such file or directory
6489  *		namei:ELOOP		Too many levels of symbolic links
6490  *		namei:EBADF		Bad file descriptor
6491  *		namei:ENOTDIR		Not a directory
6492  *		namei:???
6493  *		access1:
6494  *
6495  * Implicit returns:
6496  *		uap->results		Array contents modified
6497  *
6498  * Notes:	The uap->entries are structured as an arbitrary length array
6499  *		of accessx descriptors, followed by one or more NULL terminated
6500  *		strings
6501  *
6502  *			struct accessx_descriptor[0]
6503  *			...
6504  *			struct accessx_descriptor[n]
6505  *			char name_data[0];
6506  *
6507  *		We determine the entry count by walking the buffer containing
6508  *		the uap->entries argument descriptor.  For each descriptor we
6509  *		see, the valid values for the offset ad_name_offset will be
6510  *		in the byte range:
6511  *
6512  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6513  *						to
6514  *				[ uap->entries + uap->size - 2 ]
6515  *
6516  *		since we must have at least one string, and the string must
6517  *		be at least one character plus the NULL terminator in length.
6518  *
6519  * XXX:		Need to support the check-as uid argument
6520  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* No credential yet; the cleanup path checks this before unref. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; larger ones allocate. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  "Expected" lookup failures are
		 * recorded per-entry; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6762 
6763 
6764 /*
6765  * Returns:	0			Success
6766  *		namei:EFAULT		Bad address
6767  *		namei:ENAMETOOLONG	Filename too long
6768  *		namei:ENOENT		No such file or directory
6769  *		namei:ELOOP		Too many levels of symbolic links
6770  *		namei:EBADF		Bad file descriptor
6771  *		namei:ENOTDIR		Not a directory
6772  *		namei:???
6773  *		access1:
6774  */
/*
 * Common implementation for access(2) and faccessat(2): look up the
 * path relative to 'fd' and check 'amode' permissions via access1().
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* The parent was only acquired when delete rights were checked. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* Drop the real-uid credential copy taken above, if any. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6856 
/*
 * access(2): check real-uid permissions for a path.
 * Note: uap->flags carries the amode bits; the AT_* flag argument is 0.
 */
int
access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
{
	return faccessat_internal(vfs_context_current(), AT_FDCWD,
	           uap->path, uap->flags, 0, UIO_USERSPACE);
}
6863 
6864 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6865 faccessat(__unused proc_t p, struct faccessat_args *uap,
6866     __unused int32_t *retval)
6867 {
6868 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6869 		return EINVAL;
6870 	}
6871 
6872 	return faccessat_internal(vfs_context_current(), uap->fd,
6873 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6874 }
6875 
6876 /*
6877  * Returns:	0			Success
6878  *		EFAULT
6879  *	copyout:EFAULT
6880  *	namei:???
6881  *	vn_stat:???
6882  */
6883 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6884 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6885     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6886     enum uio_seg segflg, int fd, int flag)
6887 {
6888 	struct nameidata nd;
6889 	int follow;
6890 	union {
6891 		struct stat sb;
6892 		struct stat64 sb64;
6893 	} source = {};
6894 	union {
6895 		struct user64_stat user64_sb;
6896 		struct user32_stat user32_sb;
6897 		struct user64_stat64 user64_sb64;
6898 		struct user32_stat64 user32_sb64;
6899 	} dest = {};
6900 	caddr_t sbp;
6901 	int error, my_size;
6902 	kauth_filesec_t fsec;
6903 	size_t xsecurity_bufsize;
6904 	void * statptr;
6905 	struct fileproc *fp = NULL;
6906 	int needsrealdev = 0;
6907 
6908 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6909 	NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6910 	    segflg, path, ctx);
6911 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6912 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6913 	}
6914 
6915 #if NAMEDRSRCFORK
6916 	int is_namedstream = 0;
6917 	/* stat calls are allowed for resource forks. */
6918 	nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6919 #endif
6920 
6921 	if (flag & AT_FDONLY) {
6922 		vnode_t fvp;
6923 
6924 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6925 		if (error) {
6926 			return error;
6927 		}
6928 		if ((error = vnode_getwithref(fvp))) {
6929 			file_drop(fd);
6930 			return error;
6931 		}
6932 		nd.ni_vp = fvp;
6933 	} else {
6934 		error = nameiat(&nd, fd);
6935 		if (error) {
6936 			return error;
6937 		}
6938 	}
6939 	fsec = KAUTH_FILESEC_NONE;
6940 
6941 	statptr = (void *)&source;
6942 
6943 #if NAMEDRSRCFORK
6944 	/* Grab reference on the shadow stream file vnode to
6945 	 * force an inactive on release which will mark it
6946 	 * for recycle.
6947 	 */
6948 	if (vnode_isnamedstream(nd.ni_vp) &&
6949 	    (nd.ni_vp->v_parent != NULLVP) &&
6950 	    vnode_isshadow(nd.ni_vp)) {
6951 		is_namedstream = 1;
6952 		vnode_ref(nd.ni_vp);
6953 	}
6954 #endif
6955 
6956 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6957 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6958 		/*
6959 		 * If the caller has the file open, and is not
6960 		 * requesting extended security information, we are
6961 		 * going to let them get the basic stat information.
6962 		 */
6963 		error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6964 		    fp->fp_glob->fg_cred);
6965 	} else {
6966 		error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6967 		    isstat64, needsrealdev, ctx);
6968 	}
6969 
6970 #if NAMEDRSRCFORK
6971 	if (is_namedstream) {
6972 		vnode_rele(nd.ni_vp);
6973 	}
6974 #endif
6975 	vnode_put(nd.ni_vp);
6976 	nameidone(&nd);
6977 	if (fp) {
6978 		file_drop(fd);
6979 		fp = NULL;
6980 	}
6981 
6982 	if (error) {
6983 		return error;
6984 	}
6985 	/* Zap spare fields */
6986 	if (isstat64 != 0) {
6987 		source.sb64.st_lspare = 0;
6988 		source.sb64.st_qspare[0] = 0LL;
6989 		source.sb64.st_qspare[1] = 0LL;
6990 		if (vfs_context_is64bit(ctx)) {
6991 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6992 			my_size = sizeof(dest.user64_sb64);
6993 			sbp = (caddr_t)&dest.user64_sb64;
6994 		} else {
6995 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6996 			my_size = sizeof(dest.user32_sb64);
6997 			sbp = (caddr_t)&dest.user32_sb64;
6998 		}
6999 		/*
7000 		 * Check if we raced (post lookup) against the last unlink of a file.
7001 		 */
7002 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7003 			source.sb64.st_nlink = 1;
7004 		}
7005 	} else {
7006 		source.sb.st_lspare = 0;
7007 		source.sb.st_qspare[0] = 0LL;
7008 		source.sb.st_qspare[1] = 0LL;
7009 		if (vfs_context_is64bit(ctx)) {
7010 			munge_user64_stat(&source.sb, &dest.user64_sb);
7011 			my_size = sizeof(dest.user64_sb);
7012 			sbp = (caddr_t)&dest.user64_sb;
7013 		} else {
7014 			munge_user32_stat(&source.sb, &dest.user32_sb);
7015 			my_size = sizeof(dest.user32_sb);
7016 			sbp = (caddr_t)&dest.user32_sb;
7017 		}
7018 
7019 		/*
7020 		 * Check if we raced (post lookup) against the last unlink of a file.
7021 		 */
7022 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7023 			source.sb.st_nlink = 1;
7024 		}
7025 	}
7026 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7027 		goto out;
7028 	}
7029 
7030 	/* caller wants extended security information? */
7031 	if (xsecurity != USER_ADDR_NULL) {
7032 		/* did we get any? */
7033 		if (fsec == KAUTH_FILESEC_NONE) {
7034 			if (susize(xsecurity_size, 0) != 0) {
7035 				error = EFAULT;
7036 				goto out;
7037 			}
7038 		} else {
7039 			/* find the user buffer size */
7040 			xsecurity_bufsize = fusize(xsecurity_size);
7041 
7042 			/* copy out the actual data size */
7043 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7044 				error = EFAULT;
7045 				goto out;
7046 			}
7047 
7048 			/* if the caller supplied enough room, copy out to it */
7049 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7050 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7051 			}
7052 		}
7053 	}
7054 out:
7055 	if (fsec != KAUTH_FILESEC_NONE) {
7056 		kauth_filesec_free(fsec);
7057 	}
7058 	return error;
7059 }
7060 
7061 /*
7062  * stat_extended: Get file status; with extended security (ACL).
7063  *
7064  * Parameters:    p                       (ignored)
7065  *                uap                     User argument descriptor (see below)
7066  *                retval                  (ignored)
7067  *
7068  * Indirect:      uap->path               Path of file to get status from
7069  *                uap->ub                 User buffer (holds file status info)
7070  *                uap->xsecurity          ACL to get (extended security)
7071  *                uap->xsecurity_size     Size of ACL
7072  *
7073  * Returns:        0                      Success
7074  *                !0                      errno value
7075  *
7076  */
int
stat_extended(__unused proc_t p, struct stat_extended_args *uap,
    __unused int32_t *retval)
{
	/* Follow symlinks; legacy struct stat layout (isstat64 == 0). */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
	           0);
}
7085 
7086 /*
7087  * Returns:	0			Success
7088  *	fstatat_internal:???		[see fstatat_internal() in this file]
7089  */
7090 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7091 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7092 {
7093 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7094 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7095 }
7096 
/*
 * Get file status, following symlinks (struct stat64 layout).
 */
int
stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
{
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
}
7103 
7104 /*
7105  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7106  *
7107  * Parameters:    p                       (ignored)
7108  *                uap                     User argument descriptor (see below)
7109  *                retval                  (ignored)
7110  *
7111  * Indirect:      uap->path               Path of file to get status from
7112  *                uap->ub                 User buffer (holds file status info)
7113  *                uap->xsecurity          ACL to get (extended security)
7114  *                uap->xsecurity_size     Size of ACL
7115  *
7116  * Returns:        0                      Success
7117  *                !0                      errno value
7118  *
7119  */
int
stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
{
	/* Follow symlinks; struct stat64 layout (isstat64 == 1). */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
	           0);
}
7127 
7128 /*
7129  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7130  *
7131  * Parameters:    p                       (ignored)
7132  *                uap                     User argument descriptor (see below)
7133  *                retval                  (ignored)
7134  *
7135  * Indirect:      uap->path               Path of file to get status from
7136  *                uap->ub                 User buffer (holds file status info)
7137  *                uap->xsecurity          ACL to get (extended security)
7138  *                uap->xsecurity_size     Size of ACL
7139  *
7140  * Returns:        0                      Success
7141  *                !0                      errno value
7142  *
7143  */
int
lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
{
	/* AT_SYMLINK_NOFOLLOW: stat the link itself; legacy stat layout. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
	           AT_SYMLINK_NOFOLLOW);
}
7151 
7152 /*
7153  * Get file status; this version does not follow links.
7154  */
7155 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7156 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7157 {
7158 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7159 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7160 }
7161 
/*
 * Get file status without following links (struct stat64 layout).
 */
int
lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
{
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
}
7168 
7169 /*
7170  * lstat64_extended: Get file status; can handle large inode numbers; does not
7171  * follow links; with extended security (ACL).
7172  *
7173  * Parameters:    p                       (ignored)
7174  *                uap                     User argument descriptor (see below)
7175  *                retval                  (ignored)
7176  *
7177  * Indirect:      uap->path               Path of file to get status from
7178  *                uap->ub                 User buffer (holds file status info)
7179  *                uap->xsecurity          ACL to get (extended security)
7180  *                uap->xsecurity_size     Size of ACL
7181  *
7182  * Returns:        0                      Success
7183  *                !0                      errno value
7184  *
7185  */
int
lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
{
	/* No link following; struct stat64 layout; ACL copied out. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
	           AT_SYMLINK_NOFOLLOW);
}
7193 
7194 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7195 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7196 {
7197 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7198 		return EINVAL;
7199 	}
7200 
7201 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7202 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7203 }
7204 
7205 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7206 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7207     __unused int32_t *retval)
7208 {
7209 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7210 		return EINVAL;
7211 	}
7212 
7213 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7214 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7215 }
7216 
7217 /*
7218  * Get configurable pathname variables.
7219  *
7220  * Returns:	0			Success
7221  *	namei:???
7222  *	vn_pathconf:???
7223  *
7224  * Notes:	Global implementation  constants are intended to be
7225  *		implemented in this function directly; all other constants
7226  *		are per-FS implementation, and therefore must be handled in
7227  *		each respective FS, instead.
7228  *
7229  * XXX We implement some things globally right now that should actually be
7230  * XXX per-FS; we will need to deal with this at some point.
7231  */
7232 /* ARGSUSED */
7233 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7234 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7235 {
7236 	int error;
7237 	struct nameidata nd;
7238 	vfs_context_t ctx = vfs_context_current();
7239 
7240 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7241 	    UIO_USERSPACE, uap->path, ctx);
7242 	error = namei(&nd);
7243 	if (error) {
7244 		return error;
7245 	}
7246 
7247 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7248 
7249 	vnode_put(nd.ni_vp);
7250 	nameidone(&nd);
7251 	return error;
7252 }
7253 
7254 /*
7255  * Return target name of a symbolic link.
7256  */
7257 /* ARGSUSED */
7258 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7259 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7260     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7261     int *retval)
7262 {
7263 	vnode_t vp;
7264 	uio_t auio;
7265 	int error;
7266 	struct nameidata nd;
7267 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
7268 	bool put_vnode;
7269 
7270 	if (bufsize > INT32_MAX) {
7271 		return EINVAL;
7272 	}
7273 
7274 	if (lnk_vp) {
7275 		vp = lnk_vp;
7276 		put_vnode = false;
7277 	} else {
7278 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7279 		    seg, path, ctx);
7280 
7281 		error = nameiat(&nd, fd);
7282 		if (error) {
7283 			return error;
7284 		}
7285 		vp = nd.ni_vp;
7286 		put_vnode = true;
7287 		nameidone(&nd);
7288 	}
7289 
7290 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7291 	    &uio_buf[0], sizeof(uio_buf));
7292 	uio_addiov(auio, buf, bufsize);
7293 	if (vp->v_type != VLNK) {
7294 		error = EINVAL;
7295 	} else {
7296 #if CONFIG_MACF
7297 		error = mac_vnode_check_readlink(ctx, vp);
7298 #endif
7299 		if (error == 0) {
7300 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7301 			    ctx);
7302 		}
7303 		if (error == 0) {
7304 			error = VNOP_READLINK(vp, auio, ctx);
7305 		}
7306 	}
7307 
7308 	if (put_vnode) {
7309 		vnode_put(vp);
7310 	}
7311 
7312 	*retval = (int)(bufsize - uio_resid(auio));
7313 	return error;
7314 }
7315 
7316 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7317 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7318 {
7319 	enum uio_seg procseg;
7320 	vnode_t vp;
7321 	int error;
7322 
7323 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7324 
7325 	AUDIT_ARG(fd, uap->fd);
7326 
7327 	if ((error = file_vnode(uap->fd, &vp))) {
7328 		return error;
7329 	}
7330 	if ((error = vnode_getwithref(vp))) {
7331 		file_drop(uap->fd);
7332 		return error;
7333 	}
7334 
7335 	error = readlinkat_internal(vfs_context_current(), -1,
7336 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7337 	    uap->bufsize, procseg, retval);
7338 
7339 	vnode_put(vp);
7340 	file_drop(uap->fd);
7341 	return error;
7342 }
7343 
7344 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7345 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7346 {
7347 	enum uio_seg procseg;
7348 
7349 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7350 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7351 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7352 	           uap->count, procseg, retval);
7353 }
7354 
7355 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7356 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7357 {
7358 	enum uio_seg procseg;
7359 
7360 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7361 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7362 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7363 	           retval);
7364 }
7365 
/*
 * Change file flags, the deep inner layer.
 *
 * Parameters:	vp		Vnode to modify (caller holds an iocount)
 *		va		vnode_attr with va_flags set to the new value
 *		setattr		Callback that applies the change, invoked as
 *				(*setattr)(vp, arg, ctx)
 *		arg		Opaque argument passed through to `setattr'
 *		ctx		VFS context for authorization
 *
 * Returns:	0		Success
 *		!0		errno from the MAC check, authorization, or
 *				the setattr callback
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC preflight before any authorization work. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7407 
/*
 * Change file flags.
 *
 * Parameters:	vp	Vnode whose flags to change; the caller's iocount is
 *			ALWAYS released here, success or failure
 *		flags	New value for va_flags
 *		ctx	VFS context for authorization
 *
 * Returns:	0	Success
 *		ENOTSUP	Setattr succeeded but the FS did not report va_flags
 *			as supported
 *		chflags0:???	[anything chflags0 can return]
 *
 * NOTE: this will vnode_put() `vp'
 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/*
	 * `va' is passed both as the attribute set and as the opaque callback
	 * argument; the cast adapts vnode_setattr to chflags0's callback type.
	 */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	vnode_put(vp);

	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7431 
/*
 * Change flags of a file given a path name.
 *
 * Indirect:	uap->path	Path of file whose flags to change
 *		uap->flags	New flag value
 *
 * Returns:	0		Success
 *		namei:???	[anything namei can return]
 *		chflags1:???	[anything chflags1 can return]
 */
/* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent vnode so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory, then release it. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7470 
/*
 * Change flags of a file given a file descriptor.
 *
 * Indirect:	uap->fd		File descriptor of file whose flags to change
 *		uap->flags	New flag value
 *
 * Returns:	0		Success
 *		!0		errno from file_vnode, vnode_getwithref or
 *				chflags1
 */
/* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* true: resolve and break the lease on vp's parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7504 
/*
 * Change security information on a filesystem object.
 *
 * Returns:	0			Success
 *		EPERM			Operation not permitted
 *		vnode_authattr:???	[anything vnode_authattr can return]
 *		vnode_authorize:???	[anything vnode_authorize can return]
 *		vnode_setattr:???	[anything vnode_setattr can return]
 *
 * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
 *		translated to EPERM before being returned.
 *
 *		`vap' may have any combination of va_mode, va_uid/va_gid and
 *		va_acl active; MAC checks and notifications are issued only
 *		for the attributes actually being set.
 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC preflight: one check per class of attribute being changed. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Notify MAC modules only after the change succeeded. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7584 
7585 
/*
 * Change mode of a file given a path name.
 *
 * Parameters:	ctx	VFS context for the lookup and change
 *		path	Path of file, in the address space named by `segflg'
 *		vap	vnode_attr describing the changes (see chmod_vnode)
 *		fd	Directory fd the path is relative to (or AT_FDCWD)
 *		flag	AT_SYMLINK_NOFOLLOW / AT_SYMLINK_NOFOLLOW_ANY
 *		segflg	Address space of `path'
 *
 * Returns:	0			Success
 *		namei:???		[anything namei can return]
 *		chmod_vnode:???		[anything chmod_vnode can return]
 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent vnode so any directory lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/* Either no-follow variant suppresses resolving a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Map the AT_ flag onto namei's corresponding lookup flag. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7625 
/*
 * Build the vnode_attr for chmod_extended()/fchmod_extended(), copying in
 * an optional kauth_filesec from user space.
 *
 * Parameters:	pva		Out: initialized vnode_attr
 *		pxsecdst	Out: copied-in filesec, or NULL; on success the
 *				caller must free a non-NULL result with
 *				kauth_filesec_free()
 *		mode		Mode to set, or -1 for "don't change mode"
 *		uid		UID to set, or KAUTH_UID_NONE
 *		gid		GID to set, or KAUTH_GID_NONE
 *		xsecurity	User address of filesec; USER_ADDR_NULL means
 *				"no ACL change", the literal value 1
 *				(_FILESEC_REMOVE_ACL) means "remove the ACL"
 *
 * Returns:	0		Success
 *		!0		errno from kauth_copyinfilesec
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* Mode unchanged: leave va_mode inactive but zeroed. */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		/* ACL points into the copied-in filesec; flag that layout. */
		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7670 
7671 /*
7672  * chmod_extended: Change the mode of a file given a path name; with extended
7673  * argument list (including extended security (ACL)).
7674  *
7675  * Parameters:	p			Process requesting the open
7676  *		uap			User argument descriptor (see below)
7677  *		retval			(ignored)
7678  *
7679  * Indirect:	uap->path		Path to object (same as 'chmod')
7680  *		uap->uid		UID to set
7681  *		uap->gid		GID to set
7682  *		uap->mode		File mode to set (same as 'chmod')
7683  *		uap->xsecurity		ACL to set (or delete)
7684  *
7685  * Returns:	0			Success
7686  *		!0			errno value
7687  *
7688  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7689  *
7690  * XXX:		We should enummerate the possible errno values here, and where
7691  *		in the code they originated.
7692  */
7693 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7694 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7695 {
7696 	int error;
7697 	struct vnode_attr va;
7698 	kauth_filesec_t xsecdst = NULL;
7699 
7700 	AUDIT_ARG(owner, uap->uid, uap->gid);
7701 
7702 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7703 	    uap->gid, uap->xsecurity);
7704 
7705 	if (error) {
7706 		return error;
7707 	}
7708 
7709 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7710 	    UIO_USERSPACE);
7711 
7712 	if (xsecdst != NULL) {
7713 		kauth_filesec_free(xsecdst);
7714 	}
7715 	return error;
7716 }
7717 
7718 /*
7719  * Returns:	0			Success
7720  *		chmodat:???		[anything chmodat can return]
7721  */
7722 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7723 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7724     int flag, enum uio_seg segflg)
7725 {
7726 	struct vnode_attr va;
7727 
7728 	VATTR_INIT(&va);
7729 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7730 
7731 	return chmodat(ctx, path, &va, fd, flag, segflg);
7732 }
7733 
7734 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7735 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7736 {
7737 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7738 	           AT_FDCWD, 0, UIO_USERSPACE);
7739 }
7740 
7741 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7742 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7743 {
7744 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7745 		return EINVAL;
7746 	}
7747 
7748 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7749 	           uap->fd, uap->flag, UIO_USERSPACE);
7750 }
7751 
/*
 * Change mode of a file given a file descriptor.
 *
 * Parameters:	p	(ignored)
 *		fd	File descriptor of file whose attributes to change
 *		vap	vnode_attr describing the changes (see chmod_vnode)
 *
 * Returns:	0	Success
 *		!0	errno from file_vnode, vnode_getwithref or chmod_vnode
 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* true: resolve and break the lease on vp's parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7782 
7783 /*
7784  * fchmod_extended: Change mode of a file given a file descriptor; with
7785  * extended argument list (including extended security (ACL)).
7786  *
7787  * Parameters:    p                       Process requesting to change file mode
7788  *                uap                     User argument descriptor (see below)
7789  *                retval                  (ignored)
7790  *
7791  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7792  *                uap->uid                UID to set
7793  *                uap->gid                GID to set
7794  *                uap->xsecurity          ACL to set (or delete)
7795  *                uap->fd                 File descriptor of file to change mode
7796  *
7797  * Returns:        0                      Success
7798  *                !0                      errno value
7799  *
7800  */
7801 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7802 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7803 {
7804 	int error;
7805 	struct vnode_attr va;
7806 	kauth_filesec_t xsecdst = NULL;
7807 
7808 	AUDIT_ARG(owner, uap->uid, uap->gid);
7809 
7810 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7811 	    uap->gid, uap->xsecurity);
7812 
7813 	if (error) {
7814 		return error;
7815 	}
7816 
7817 	error = fchmod1(p, uap->fd, &va);
7818 
7819 	if (xsecdst != NULL) {
7820 		kauth_filesec_free(xsecdst);
7821 	}
7822 	return error;
7823 }
7824 
7825 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7826 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7827 {
7828 	struct vnode_attr va;
7829 
7830 	VATTR_INIT(&va);
7831 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7832 
7833 	return fchmod1(p, uap->fd, &va);
7834 }
7835 
7836 
/*
 * Set ownership given a path name.
 *
 * Parameters:	ctx	VFS context for the lookup and change
 *		fd	Directory fd the path is relative to (or AT_FDCWD)
 *		path	Path of file, in the address space named by `segflg'
 *		uid	New owner UID, or (uid_t)VNOVAL to leave unchanged
 *		gid	New group GID, or (gid_t)VNOVAL to leave unchanged
 *		flag	AT_SYMLINK_NOFOLLOW / AT_SYMLINK_NOFOLLOW_ANY
 *		segflg	Address space of `path'
 *
 * Returns:	0	Success
 *		EPERM	Authorization failed (EACCES is translated; see below)
 *		!0	errno value
 */
/* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent vnode so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(owner, uid, gid);

	/* Either no-follow variant suppresses resolving a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	/* VNOVAL in either id means "leave that attribute alone". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

#if CONFIG_FILE_LEASES
	/* Parent vnode was only obtained when WANTPARENT was requested. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7922 
7923 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7924 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7925 {
7926 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7927 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7928 }
7929 
7930 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7931 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7932 {
7933 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7934 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7935 }
7936 
7937 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7938 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7939 {
7940 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7941 		return EINVAL;
7942 	}
7943 
7944 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7945 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7946 }
7947 
/*
 * Set ownership given a file descriptor.
 *
 * Indirect:	uap->fd		File descriptor of file to change
 *		uap->uid	New owner UID, or VNOVAL to leave unchanged
 *		uap->gid	New group GID, or VNOVAL to leave unchanged
 *
 * Returns:	0		Success
 *		EPERM		Named stream, or authorization failure
 *				(EACCES from vnode_authorize is translated)
 *		!0		errno value
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL in either id means "leave that attribute alone". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures surface as EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* true: resolve and break the lease on vp's parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8025 
8026 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8027 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8028 {
8029 	int error;
8030 
8031 	if (usrtvp == USER_ADDR_NULL) {
8032 		struct timeval old_tv;
8033 		/* XXX Y2038 bug because of microtime argument */
8034 		microtime(&old_tv);
8035 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8036 		tsp[1] = tsp[0];
8037 	} else {
8038 		if (IS_64BIT_PROCESS(current_proc())) {
8039 			struct user64_timeval tv[2];
8040 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8041 			if (error) {
8042 				return error;
8043 			}
8044 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8045 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8046 		} else {
8047 			struct user32_timeval tv[2];
8048 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8049 			if (error) {
8050 				return error;
8051 			}
8052 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8053 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8054 		}
8055 	}
8056 	return 0;
8057 }
8058 
/*
 * Apply access/modification times to a vnode.
 *
 * Parameters:	ctx		VFS context for authorization
 *		vp		Vnode to update (caller holds an iocount)
 *		ts		ts[0] = access time, ts[1] = modification time
 *		nullflag	Non-zero when the caller passed a NULL times
 *				pointer ("set to now"); sets VA_UTIMES_NULL
 *				and suppresses the EACCES->EPERM translation
 *
 * Returns:	0		Success
 *		EPERM		Named stream, or (with explicit times) an
 *				authorization failure
 *		!0		errno value
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* EACCES becomes EPERM only when explicit times were supplied. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8115 
/*
 * Set the access and modification times of a file.
 *
 * Indirect:	uap->path	Path of file to update
 *		uap->tptr	User address of two timevals, or NULL to use
 *				the current time
 *
 * Returns:	0		Success
 *		!0		errno from namei, getutimes or setutimes
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent vnode so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Parent vnode was only obtained when WANTPARENT was requested. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8168 
/*
 * Set the access and modification times of a file given an open
 * file descriptor.
 *
 * Indirect:	uap->fd		File descriptor of file to update
 *		uap->tptr	User address of two timevals, or NULL to use
 *				the current time
 *
 * Returns:	0		Success
 *		!0		errno from getutimes, file_vnode,
 *				vnode_getwithref or setutimes
 */
/* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	usrtvp = uap->tptr;
	/* Resolve times before taking any reference on the vnode. */
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* true: resolve and break the lease on vp's parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8204 
8205 static int
truncate_validate_common(proc_t p,off_t length)8206 truncate_validate_common(proc_t p, off_t length)
8207 {
8208 	rlim_t fsize_limit;
8209 
8210 	if (length < 0) {
8211 		return EINVAL;
8212 	}
8213 
8214 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8215 	if ((rlim_t)length > fsize_limit) {
8216 		psignal(p, SIGXFSZ);
8217 		return EFBIG;
8218 	}
8219 
8220 	return 0;
8221 }
8222 
/*
 * Common backend for truncate(2)/ftruncate(2).
 *
 * Parameters:	vp		Vnode to truncate (caller holds an iocount)
 *		length		New file size (already validated by
 *				truncate_validate_common)
 *		cred		Credential for the MAC truncate check (NOCRED
 *				from the path-based caller, the fileglob's
 *				cred from ftruncate)
 *		ctx		VFS context
 *		need_auth	When true, run vnode_authattr/vnode_authorize;
 *				ftruncate passes false (see comment below)
 *
 * Returns:	0		Success
 *		!0		errno value
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after a successful size change. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8273 
8274 /*
8275  * Truncate a file given its path name.
8276  */
8277 /* ARGSUSED */
8278 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8279 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8280 {
8281 	vfs_context_t ctx = vfs_context_current();
8282 	vnode_t vp;
8283 	int error;
8284 	struct nameidata nd;
8285 
8286 	if ((error = truncate_validate_common(p, uap->length))) {
8287 		return error;
8288 	}
8289 
8290 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8291 	    UIO_USERSPACE, uap->path, ctx);
8292 
8293 	if ((error = namei(&nd))) {
8294 		return error;
8295 	}
8296 
8297 	vp = nd.ni_vp;
8298 	nameidone(&nd);
8299 
8300 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8301 	vnode_put(vp);
8302 
8303 	return error;
8304 }
8305 
/*
 * Truncate a file given a file descriptor.
 *
 * Indirect:	uap->fd		File descriptor of file to truncate; must be
 *				a vnode open for writing, or a POSIX shared
 *				memory object
 *		uap->length	New size
 *
 * Returns:	0		Success
 *		EINVAL		Bad length, unsupported file type, or fd not
 *				open for writing
 *		!0		errno value
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* Only vnodes and POSIX shm objects can be truncated by fd. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth = false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8360 
8361 
8362 /*
8363  * Sync an open file with synchronized I/O _file_ integrity completion
8364  */
8365 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync() is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_WAIT requests full file-integrity completion. */
	return fsync_common(p, uap, MNT_WAIT);
}
8372 
8373 
8374 /*
8375  * Sync an open file with synchronized I/O _file_ integrity completion
8376  *
8377  * Notes:	This is a legacy support function that does not test for
8378  *		thread cancellation points.
8379  */
8380 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Same as fsync(), but without testing for thread cancellation. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8386 
8387 
8388 /*
8389  * Sync an open file with synchronized I/O _data_ integrity completion
8390  */
8391 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync() is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_DWAIT requests data-integrity (not full file-integrity) sync. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8398 
8399 
8400 /*
8401  * fsync_common
8402  *
8403  * Common fsync code to support both synchronized I/O file integrity completion
8404  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8405  *
8406  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8407  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8409  * includes additional metadata unnecessary for retrieving the file data
8410  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8411  * storage.
8412  *
8413  * Parameters:	p				The process
8414  *		uap->fd				The descriptor to synchronize
8415  *		flags				The data integrity flags
8416  *
8417  * Returns:	int				Success
8418  *	fp_getfvp:EBADF				Bad file descriptor
8419  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8420  *	VNOP_FSYNC:???				unspecified
8421  *
8422  * Notes:	We use struct fsync_args because it is a short name, and all
8423  *		caller argument structures are otherwise identical.
8424  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve fd to fileproc + vnode; fails if fd is not vnode-backed. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		/* Drop the fileproc reference taken by fp_getfvp(). */
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8462 
8463 /*
8464  * Duplicate files.  Source must be a file, target must be a file or
8465  * must not exist.
8466  *
8467  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8468  *     perform inheritance correctly.
8469  */
8470 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source.  Note: no FOLLOW flag is passed here. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the destination for creation.  SAVESTART keeps the start
	 * directory (tond.ni_startdir) referenced so it can be released
	 * at out: below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target may only be replaced with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Neither source nor target may be a directory. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets cannot be copied (fdesc vnodes are exempt). */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Authorize: read source, delete existing target, add to target dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Refuse if the source is the destination's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 *
	 * error == -1 is a private "silently succeed" marker; it is
	 * converted to 0 just before returning.
	 */
	if (fvp == tvp) {
		error = -1;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease held on the destination's parent directory. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Map the internal "nothing to do" marker to success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8577 
8578 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8579 
8580 /*
8581  * Helper function for doing clones. The caller is expected to provide an
8582  * iocounted source vnode and release it.
8583  */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;		/* va.va_acl must be kauth_acl_free()'d */
	boolean_t attr_cleanup;		/* nva needs vn_attribute_cleanup() */
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;		/* attributes read from the source */
	struct vnode_attr nva;		/* attributes to apply to the clone */
	uint32_t vnop_flags;

	/* Only regular files, symlinks, and directories can be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points may not be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; WANTPARENT keeps a reference on tdvp. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning only works within a single file system. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* May we add an entry (file or subdirectory) to the target dir? */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * May we read the source?  If the caller already authorized
	 * data-read access (e.g. fclonefileat on a readable descriptor),
	 * skip re-checking KAUTH_VNODE_READ_DATA.
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		/* Keep the destination's vault/restricted bits, not the source's. */
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	/* Break any lease held on the destination's parent directory. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8813 
8814 /*
8815  * clone files or directories, target must not exist.
8816  */
8817 /* ARGSUSED */
8818 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8819 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8820     __unused int32_t *retval)
8821 {
8822 	vnode_t fvp;
8823 	struct nameidata fromnd;
8824 	int follow;
8825 	int error;
8826 	vfs_context_t ctx = vfs_context_current();
8827 
8828 	/* Check that the flags are valid. */
8829 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8830 		return EINVAL;
8831 	}
8832 
8833 	AUDIT_ARG(fd, uap->src_dirfd);
8834 
8835 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8836 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8837 	    UIO_USERSPACE, uap->src, ctx);
8838 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8839 		return error;
8840 	}
8841 
8842 	fvp = fromnd.ni_vp;
8843 	nameidone(&fromnd);
8844 
8845 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8846 	    uap->flags, ctx);
8847 
8848 	vnode_put(fvp);
8849 	return error;
8850 }
8851 
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* Resolve the source fd; file_drop() at out: releases this reference. */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/*
	 * FREAD on the open file stands in for read-data access, so tell
	 * clonefile_internal to skip re-authorizing KAUTH_VNODE_READ_DATA
	 * (data_read_authorised == TRUE).
	 */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8892 
8893 static int
rename_submounts_callback(mount_t mp,void * arg)8894 rename_submounts_callback(mount_t mp, void *arg)
8895 {
8896 	int error = 0;
8897 	mount_t pmp = (mount_t)arg;
8898 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8899 
8900 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8901 		return 0;
8902 	}
8903 
8904 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8905 		return 0;
8906 	}
8907 
8908 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8909 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8910 		return -1;
8911 	}
8912 
8913 	size_t pathlen = MAXPATHLEN;
8914 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8915 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8916 	}
8917 
8918 	vfs_unbusy(mp);
8919 
8920 	return error;
8921 }
8922 
8923 /*
8924  * Rename files.  Source and destination must either both be directories,
8925  * or both not be directories.  If target is a directory, it must be empty.
8926  */
8927 /* ARGSUSED */
8928 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8929 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8930     int tofd, user_addr_t to, int segflg, u_int uflags)
8931 {
8932 	vnode_t tvp, tdvp;
8933 	vnode_t fvp, fdvp;
8934 	vnode_t mnt_fvp;
8935 	struct nameidata *fromnd, *tond;
8936 	int error = 0;
8937 	int do_retry;
8938 	int retry_count;
8939 	int mntrename;
8940 	int need_event;
8941 	int need_kpath2;
8942 	int has_listeners;
8943 	const char *oname = NULL;
8944 	char *from_name = NULL, *to_name = NULL;
8945 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8946 	int from_len = 0, to_len = 0;
8947 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8948 	int holding_mntlock;
8949 	int vn_authorize_skipped;
8950 	mount_t locked_mp = NULL;
8951 	vnode_t oparent = NULLVP;
8952 #if CONFIG_FSE
8953 	fse_info from_finfo = {}, to_finfo;
8954 #endif
8955 	int from_truncated = 0, to_truncated = 0;
8956 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8957 	int batched = 0;
8958 	struct vnode_attr *fvap, *tvap;
8959 	int continuing = 0;
8960 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8961 	int32_t nofollow_any = 0;
8962 	/* carving out a chunk for structs that are too big to be on stack. */
8963 	struct {
8964 		struct nameidata from_node, to_node;
8965 		struct vnode_attr fv_attr, tv_attr;
8966 	} * __rename_data;
8967 
8968 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8969 	fromnd = &__rename_data->from_node;
8970 	tond = &__rename_data->to_node;
8971 
8972 	holding_mntlock = 0;
8973 	do_retry = 0;
8974 	retry_count = 0;
8975 retry:
8976 	fvp = tvp = NULL;
8977 	fdvp = tdvp = NULL;
8978 	fvap = tvap = NULL;
8979 	mnt_fvp = NULLVP;
8980 	mntrename = FALSE;
8981 	vn_authorize_skipped = FALSE;
8982 
8983 	if (uflags & RENAME_NOFOLLOW_ANY) {
8984 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8985 	}
8986 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8987 	    segflg, from, ctx);
8988 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8989 
8990 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8991 	    segflg, to, ctx);
8992 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8993 
8994 continue_lookup:
8995 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8996 		if ((error = nameiat(fromnd, fromfd))) {
8997 			goto out1;
8998 		}
8999 		fdvp = fromnd->ni_dvp;
9000 		fvp  = fromnd->ni_vp;
9001 
9002 		if (fvp && fvp->v_type == VDIR) {
9003 			tond->ni_cnd.cn_flags |= WILLBEDIR;
9004 		}
9005 	}
9006 
9007 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9008 		if ((error = nameiat(tond, tofd))) {
9009 			/*
9010 			 * Translate error code for rename("dir1", "dir2/.").
9011 			 */
9012 			if (error == EISDIR && fvp->v_type == VDIR) {
9013 				error = EINVAL;
9014 			}
9015 			goto out1;
9016 		}
9017 		tdvp = tond->ni_dvp;
9018 		tvp  = tond->ni_vp;
9019 	}
9020 
9021 #if DEVELOPMENT || DEBUG
9022 	/*
9023 	 * XXX VSWAP: Check for entitlements or special flag here
9024 	 * so we can restrict access appropriately.
9025 	 */
9026 #else /* DEVELOPMENT || DEBUG */
9027 
9028 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9029 		error = EPERM;
9030 		goto out1;
9031 	}
9032 
9033 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9034 		error = EPERM;
9035 		goto out1;
9036 	}
9037 #endif /* DEVELOPMENT || DEBUG */
9038 
9039 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9040 		error = ENOENT;
9041 		goto out1;
9042 	}
9043 
9044 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9045 		int32_t pval = 0;
9046 		int err = 0;
9047 
9048 		/*
9049 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9050 		 * has the same name as target iff the following conditions are met:
9051 		 * 1. the target file system is case insensitive
9052 		 * 2. source and target directories are the same
9053 		 * 3. source and target files are the same
9054 		 * 4. name only differs in case (determined by underlying filesystem)
9055 		 */
9056 		if (fvp != tvp || fdvp != tdvp) {
9057 			error = EEXIST;
9058 			goto out1;
9059 		}
9060 
9061 		/*
9062 		 * Assume that the target file system is case sensitive if
9063 		 * _PC_CASE_SENSITIVE selector isn't supported.
9064 		 */
9065 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9066 		if (err != 0 || pval != 0) {
9067 			error = EEXIST;
9068 			goto out1;
9069 		}
9070 	}
9071 
9072 	batched = vnode_compound_rename_available(fdvp);
9073 
9074 #if CONFIG_FSE
9075 	need_event = need_fsevent(FSE_RENAME, fdvp);
9076 	if (need_event) {
9077 		if (fvp) {
9078 			get_fse_info(fvp, &from_finfo, ctx);
9079 		} else {
9080 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9081 			if (error) {
9082 				goto out1;
9083 			}
9084 
9085 			fvap = &__rename_data->fv_attr;
9086 		}
9087 
9088 		if (tvp) {
9089 			get_fse_info(tvp, &to_finfo, ctx);
9090 		} else if (batched) {
9091 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9092 			if (error) {
9093 				goto out1;
9094 			}
9095 
9096 			tvap = &__rename_data->tv_attr;
9097 		}
9098 	}
9099 #else
9100 	need_event = 0;
9101 #endif /* CONFIG_FSE */
9102 
9103 	has_listeners = kauth_authorize_fileop_has_listeners();
9104 
9105 	need_kpath2 = 0;
9106 #if CONFIG_AUDIT
9107 	if (AUDIT_RECORD_EXISTS()) {
9108 		need_kpath2 = 1;
9109 	}
9110 #endif
9111 
9112 	if (need_event || has_listeners) {
9113 		if (from_name == NULL) {
9114 			GET_PATH(from_name);
9115 		}
9116 
9117 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9118 
9119 		if (from_name_no_firmlink == NULL) {
9120 			GET_PATH(from_name_no_firmlink);
9121 		}
9122 
9123 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9124 	}
9125 
9126 	if (need_event || need_kpath2 || has_listeners) {
9127 		if (to_name == NULL) {
9128 			GET_PATH(to_name);
9129 		}
9130 
9131 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9132 
9133 		if (to_name_no_firmlink == NULL) {
9134 			GET_PATH(to_name_no_firmlink);
9135 		}
9136 
9137 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9138 		if (to_name && need_kpath2) {
9139 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9140 		}
9141 	}
9142 	if (!fvp) {
9143 		/*
9144 		 * Claim: this check will never reject a valid rename.
9145 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9146 		 * Suppose fdvp and tdvp are not on the same mount.
9147 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9148 		 *      then you can't move it to within another dir on the same mountpoint.
9149 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9150 		 *
9151 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9152 		 */
9153 		if (fdvp->v_mount != tdvp->v_mount) {
9154 			error = EXDEV;
9155 			goto out1;
9156 		}
9157 		goto skipped_lookup;
9158 	}
9159 
9160 	/*
9161 	 * If the source and destination are the same (i.e. they're
9162 	 * links to the same vnode) and the target file system is
9163 	 * case sensitive, then there is nothing to do.
9164 	 *
9165 	 * XXX Come back to this.
9166 	 */
9167 	if (fvp == tvp) {
9168 		int pathconf_val;
9169 
9170 		/*
9171 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9172 		 * then assume that this file system is case sensitive.
9173 		 */
9174 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9175 		    pathconf_val != 0) {
9176 			vn_authorize_skipped = TRUE;
9177 			goto out1;
9178 		}
9179 	}
9180 
9181 	/*
9182 	 * Allow the renaming of mount points.
9183 	 * - target must not exist
9184 	 * - target must reside in the same directory as source
9185 	 * - union mounts cannot be renamed
9186 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9187 	 *
9188 	 * XXX Handle this in VFS after a continued lookup (if we missed
9189 	 * in the cache to start off)
9190 	 *
9191 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9192 	 * we'll skip past here.  The file system is responsible for
9193 	 * checking that @tvp is not a descendent of @fvp and vice versa
9194 	 * so it should always return EINVAL if either @tvp or @fvp is the
9195 	 * root of a volume.
9196 	 */
9197 	if ((fvp->v_flag & VROOT) &&
9198 	    (fvp->v_type == VDIR) &&
9199 	    (tvp == NULL) &&
9200 	    (fvp->v_mountedhere == NULL) &&
9201 	    (fdvp == tdvp) &&
9202 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9203 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9204 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9205 		vnode_t coveredvp;
9206 
9207 		/* switch fvp to the covered vnode */
9208 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9209 		if ((vnode_getwithref(coveredvp))) {
9210 			error = ENOENT;
9211 			goto out1;
9212 		}
9213 		/*
9214 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9215 		 * later.
9216 		 */
9217 		mnt_fvp = fvp;
9218 
9219 		fvp = coveredvp;
9220 		mntrename = TRUE;
9221 	}
9222 	/*
9223 	 * Check for cross-device rename.
9224 	 */
9225 	if ((fvp->v_mount != tdvp->v_mount) ||
9226 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9227 		error = EXDEV;
9228 		goto out1;
9229 	}
9230 
9231 	/*
9232 	 * If source is the same as the destination (that is the
9233 	 * same inode number) then there is nothing to do...
9234 	 * EXCEPT if the underlying file system supports case
9235 	 * insensitivity and is case preserving.  In this case
9236 	 * the file system needs to handle the special case of
9237 	 * getting the same vnode as target (fvp) and source (tvp).
9238 	 *
9239 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9240 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9241 	 * handle the special case of getting the same vnode as target and
9242 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9243 	 * so not to cause locking problems. There is a single reference on tvp.
9244 	 *
9245 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9246 	 * that correct behaviour then is just to return success without doing
9247 	 * anything.
9248 	 *
9249 	 * XXX filesystem should take care of this itself, perhaps...
9250 	 */
9251 	if (fvp == tvp && fdvp == tdvp) {
9252 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9253 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9254 		    fromnd->ni_cnd.cn_namelen)) {
9255 			vn_authorize_skipped = TRUE;
9256 			goto out1;
9257 		}
9258 	}
9259 
9260 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9261 		/*
9262 		 * we're holding a reference and lock
9263 		 * on locked_mp, but it no longer matches
9264 		 * what we want to do... so drop our hold
9265 		 */
9266 		mount_unlock_renames(locked_mp);
9267 		mount_drop(locked_mp, 0);
9268 		holding_mntlock = 0;
9269 	}
9270 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9271 		/*
9272 		 * serialize renames that re-shape
9273 		 * the tree... if holding_mntlock is
9274 		 * set, then we're ready to go...
9275 		 * otherwise we
9276 		 * first need to drop the iocounts
9277 		 * we picked up, second take the
9278 		 * lock to serialize the access,
9279 		 * then finally start the lookup
9280 		 * process over with the lock held
9281 		 */
9282 		if (!holding_mntlock) {
9283 			/*
9284 			 * need to grab a reference on
9285 			 * the mount point before we
9286 			 * drop all the iocounts... once
9287 			 * the iocounts are gone, the mount
9288 			 * could follow
9289 			 */
9290 			locked_mp = fvp->v_mount;
9291 			mount_ref(locked_mp, 0);
9292 
9293 			/*
9294 			 * nameidone has to happen before we vnode_put(tvp)
9295 			 * since it may need to release the fs_nodelock on the tvp
9296 			 */
9297 			nameidone(tond);
9298 
9299 			if (tvp) {
9300 				vnode_put(tvp);
9301 			}
9302 			vnode_put(tdvp);
9303 
9304 			/*
9305 			 * nameidone has to happen before we vnode_put(fdvp)
9306 			 * since it may need to release the fs_nodelock on the fvp
9307 			 */
9308 			nameidone(fromnd);
9309 
9310 			vnode_put(fvp);
9311 			vnode_put(fdvp);
9312 
9313 			if (mnt_fvp != NULLVP) {
9314 				vnode_put(mnt_fvp);
9315 			}
9316 
9317 			mount_lock_renames(locked_mp);
9318 			holding_mntlock = 1;
9319 
9320 			goto retry;
9321 		}
9322 	} else {
9323 		/*
9324 		 * when we dropped the iocounts to take
9325 		 * the lock, we allowed the identity of
9326 		 * the various vnodes to change... if they did,
9327 		 * we may no longer be dealing with a rename
9328 		 * that reshapes the tree... once we're holding
9329 		 * the iocounts, the vnodes can't change type
9330 		 * so we're free to drop the lock at this point
9331 		 * and continue on
9332 		 */
9333 		if (holding_mntlock) {
9334 			mount_unlock_renames(locked_mp);
9335 			mount_drop(locked_mp, 0);
9336 			holding_mntlock = 0;
9337 		}
9338 	}
9339 
9340 	if (!batched) {
9341 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9342 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9343 		    flags, NULL);
9344 		if (error) {
9345 			if (error == ENOENT) {
9346 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9347 					/*
9348 					 * We encountered a race where after doing the namei,
9349 					 * tvp stops being valid. If so, simply re-drive the rename
9350 					 * call from the top.
9351 					 */
9352 					do_retry = 1;
9353 					retry_count += 1;
9354 				}
9355 			}
9356 			goto out1;
9357 		}
9358 	}
9359 
9360 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9361 	if (mnt_fvp != NULLVP) {
9362 		vnode_put(mnt_fvp);
9363 		mnt_fvp = NULLVP;
9364 	}
9365 
9366 	// save these off so we can later verify that fvp is the same
9367 	oname   = fvp->v_name;
9368 	oparent = fvp->v_parent;
9369 
9370 skipped_lookup:
9371 #if CONFIG_FILE_LEASES
9372 	/* Lease break needed for source's parent dir? */
9373 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9374 
9375 	/* Lease break needed for target's parent dir? */
9376 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9377 #endif
9378 
9379 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9380 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9381 	    flags, ctx);
9382 
9383 	if (holding_mntlock) {
9384 		/*
9385 		 * we can drop our serialization
9386 		 * lock now
9387 		 */
9388 		mount_unlock_renames(locked_mp);
9389 		mount_drop(locked_mp, 0);
9390 		holding_mntlock = 0;
9391 	}
9392 	if (error) {
9393 		if (error == EDATALESS) {
9394 			/*
9395 			 * If we've been here before, something has gone
9396 			 * horribly wrong and we should just get out lest
9397 			 * we spiral around the drain forever.
9398 			 */
9399 			if (flags & VFS_RENAME_DATALESS) {
9400 				error = EIO;
9401 				goto out1;
9402 			}
9403 
9404 			/*
9405 			 * The object we're renaming is dataless (or has a
9406 			 * dataless descendent) and requires materialization
9407 			 * before the rename occurs.  But we're holding the
9408 			 * mount point's rename lock, so it's not safe to
9409 			 * make the upcall.
9410 			 *
9411 			 * In this case, we release the lock, perform the
9412 			 * materialization, and start the whole thing over.
9413 			 */
9414 			error = vnode_materialize_dataless_file(fvp,
9415 			    NAMESPACE_HANDLER_RENAME_OP);
9416 
9417 			if (error == 0) {
9418 				/*
9419 				 * The next time around we need to tell the
9420 				 * file system that the materializtaion has
9421 				 * been performed.
9422 				 */
9423 				flags |= VFS_RENAME_DATALESS;
9424 				do_retry = 1;
9425 			}
9426 			goto out1;
9427 		}
9428 		if (error == EKEEPLOOKING) {
9429 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9430 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9431 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9432 				}
9433 			}
9434 
9435 			fromnd->ni_vp = fvp;
9436 			tond->ni_vp = tvp;
9437 
9438 			goto continue_lookup;
9439 		}
9440 
9441 		/*
9442 		 * We may encounter a race in the VNOP where the destination didn't
9443 		 * exist when we did the namei, but it does by the time we go and
9444 		 * try to create the entry. In this case, we should re-drive this rename
9445 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9446 		 * but other filesystems susceptible to this race could return it, too.
9447 		 */
9448 		if (error == ERECYCLE) {
9449 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9450 				do_retry = 1;
9451 				retry_count += 1;
9452 			} else {
9453 				printf("rename retry limit due to ERECYCLE reached\n");
9454 				error = ENOENT;
9455 			}
9456 		}
9457 
9458 		/*
9459 		 * For compound VNOPs, the authorization callback may return
9460 		 * ENOENT in case of racing hardlink lookups hitting the name
9461 		 * cache, redrive the lookup.
9462 		 */
9463 		if (batched && error == ENOENT) {
9464 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9465 				do_retry = 1;
9466 				retry_count += 1;
9467 			}
9468 		}
9469 
9470 		goto out1;
9471 	}
9472 
9473 	/* call out to allow 3rd party notification of rename.
9474 	 * Ignore result of kauth_authorize_fileop call.
9475 	 */
9476 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9477 	    KAUTH_FILEOP_RENAME,
9478 	    (uintptr_t)from_name, (uintptr_t)to_name);
9479 	if (flags & VFS_RENAME_SWAP) {
9480 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9481 		    KAUTH_FILEOP_RENAME,
9482 		    (uintptr_t)to_name, (uintptr_t)from_name);
9483 	}
9484 
9485 #if CONFIG_FSE
9486 	if (from_name != NULL && to_name != NULL) {
9487 		if (from_truncated || to_truncated) {
9488 			// set it here since only the from_finfo gets reported up to user space
9489 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9490 		}
9491 
9492 		if (tvap && tvp) {
9493 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9494 		}
9495 		if (fvap) {
9496 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9497 		}
9498 
9499 		if (tvp) {
9500 			add_fsevent(FSE_RENAME, ctx,
9501 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9502 			    FSE_ARG_FINFO, &from_finfo,
9503 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9504 			    FSE_ARG_FINFO, &to_finfo,
9505 			    FSE_ARG_DONE);
9506 			if (flags & VFS_RENAME_SWAP) {
9507 				/*
9508 				 * Strictly speaking, swap is the equivalent of
9509 				 * *three* renames.  FSEvents clients should only take
9510 				 * the events as a hint, so we only bother reporting
9511 				 * two.
9512 				 */
9513 				add_fsevent(FSE_RENAME, ctx,
9514 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9515 				    FSE_ARG_FINFO, &to_finfo,
9516 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9517 				    FSE_ARG_FINFO, &from_finfo,
9518 				    FSE_ARG_DONE);
9519 			}
9520 		} else {
9521 			add_fsevent(FSE_RENAME, ctx,
9522 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9523 			    FSE_ARG_FINFO, &from_finfo,
9524 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9525 			    FSE_ARG_DONE);
9526 		}
9527 	}
9528 #endif /* CONFIG_FSE */
9529 
9530 	/*
9531 	 * update filesystem's mount point data
9532 	 */
9533 	if (mntrename) {
9534 		char *cp, *pathend, *mpname;
9535 		char * tobuf;
9536 		struct mount *mp;
9537 		int maxlen;
9538 		size_t len = 0;
9539 
9540 		mp = fvp->v_mountedhere;
9541 
9542 		if (vfs_busy(mp, LK_NOWAIT)) {
9543 			error = EBUSY;
9544 			goto out1;
9545 		}
9546 		tobuf = zalloc(ZV_NAMEI);
9547 
9548 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9549 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9550 		} else {
9551 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9552 		}
9553 		if (!error) {
9554 			/* find current mount point prefix */
9555 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9556 			for (cp = pathend; *cp != '\0'; ++cp) {
9557 				if (*cp == '/') {
9558 					pathend = cp + 1;
9559 				}
9560 			}
9561 			/* find last component of target name */
9562 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9563 				if (*cp == '/') {
9564 					mpname = cp + 1;
9565 				}
9566 			}
9567 
9568 			/* Update f_mntonname of sub mounts */
9569 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9570 
9571 			/* append name to prefix */
9572 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9573 			bzero(pathend, maxlen);
9574 
9575 			strlcpy(pathend, mpname, maxlen);
9576 		}
9577 		zfree(ZV_NAMEI, tobuf);
9578 
9579 		vfs_unbusy(mp);
9580 
9581 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9582 	}
9583 	/*
9584 	 * fix up name & parent pointers.  note that we first
9585 	 * check that fvp has the same name/parent pointers it
9586 	 * had before the rename call... this is a 'weak' check
9587 	 * at best...
9588 	 *
9589 	 * XXX oparent and oname may not be set in the compound vnop case
9590 	 */
9591 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9592 		int update_flags;
9593 
9594 		update_flags = VNODE_UPDATE_NAME;
9595 
9596 		if (fdvp != tdvp) {
9597 			update_flags |= VNODE_UPDATE_PARENT;
9598 		}
9599 
9600 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9601 	}
9602 out1:
9603 	/*
9604 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9605 	 * skipped earlier as no actual rename was performed.
9606 	 */
9607 	if (vn_authorize_skipped && error == 0) {
9608 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9609 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9610 		    flags, NULL);
9611 		if (error && error == ENOENT) {
9612 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9613 				do_retry = 1;
9614 				retry_count += 1;
9615 			}
9616 		}
9617 	}
9618 	if (to_name != NULL) {
9619 		RELEASE_PATH(to_name);
9620 		to_name = NULL;
9621 	}
9622 	if (to_name_no_firmlink != NULL) {
9623 		RELEASE_PATH(to_name_no_firmlink);
9624 		to_name_no_firmlink = NULL;
9625 	}
9626 	if (from_name != NULL) {
9627 		RELEASE_PATH(from_name);
9628 		from_name = NULL;
9629 	}
9630 	if (from_name_no_firmlink != NULL) {
9631 		RELEASE_PATH(from_name_no_firmlink);
9632 		from_name_no_firmlink = NULL;
9633 	}
9634 	if (holding_mntlock) {
9635 		mount_unlock_renames(locked_mp);
9636 		mount_drop(locked_mp, 0);
9637 		holding_mntlock = 0;
9638 	}
9639 	if (tdvp) {
9640 		/*
9641 		 * nameidone has to happen before we vnode_put(tdvp)
9642 		 * since it may need to release the fs_nodelock on the tdvp
9643 		 */
9644 		nameidone(tond);
9645 
9646 		if (tvp) {
9647 			vnode_put(tvp);
9648 		}
9649 		vnode_put(tdvp);
9650 	}
9651 	if (fdvp) {
9652 		/*
9653 		 * nameidone has to happen before we vnode_put(fdvp)
9654 		 * since it may need to release the fs_nodelock on the fdvp
9655 		 */
9656 		nameidone(fromnd);
9657 
9658 		if (fvp) {
9659 			vnode_put(fvp);
9660 		}
9661 		vnode_put(fdvp);
9662 	}
9663 	if (mnt_fvp != NULLVP) {
9664 		vnode_put(mnt_fvp);
9665 	}
9666 	/*
9667 	 * If things changed after we did the namei, then we will re-drive
9668 	 * this rename call from the top.
9669 	 */
9670 	if (do_retry) {
9671 		do_retry = 0;
9672 		goto retry;
9673 	}
9674 
9675 	kfree_type(typeof(*__rename_data), __rename_data);
9676 	return error;
9677 }
9678 
9679 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9680 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9681 {
9682 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9683 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9684 }
9685 
9686 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9687 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9688 {
9689 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9690 		return EINVAL;
9691 	}
9692 
9693 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9694 		return EINVAL;
9695 	}
9696 
9697 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9698 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9699 }
9700 
9701 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9702 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9703 {
9704 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9705 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9706 }
9707 
9708 /*
9709  * Make a directory file.
9710  *
9711  * Returns:	0			Success
9712  *		EEXIST
9713  *	namei:???
9714  *	vnode_authorize:???
9715  *	vn_create:???
9716  */
9717 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * CREATE-intent lookup.  WILLBEDIR tells the lookup layer the new
	 * entry will be a directory; NAMEI_COMPOUNDMKDIR lets file systems
	 * that support it perform lookup + mkdir as a single operation.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Target already exists: mkdir must fail with EEXIST. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Will the FS handle the create itself via a compound VNOP? */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the first lookup's state before re-driving. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: report the original auth error. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the parent: break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/*
		 * EKEEPLOOKING: a compound VNOP asks us to continue the
		 * lookup from where it left off, with state kept in 'nd'.
		 */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9833 
9834 /*
9835  * mkdir_extended: Create a directory; with extended security (ACL).
9836  *
9837  * Parameters:    p                       Process requesting to create the directory
9838  *                uap                     User argument descriptor (see below)
9839  *                retval                  (ignored)
9840  *
9841  * Indirect:      uap->path               Path of directory to create
9842  *                uap->mode               Access permissions to set
9843  *                uap->xsecurity          ACL to set
9844  *
9845  * Returns:        0                      Success
9846  *                !0                      Not success
9847  *
9848  */
9849 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9850 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9851 {
9852 	int ciferror;
9853 	kauth_filesec_t xsecdst;
9854 	struct vnode_attr va;
9855 
9856 	AUDIT_ARG(owner, uap->uid, uap->gid);
9857 
9858 	xsecdst = NULL;
9859 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9860 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9861 		return ciferror;
9862 	}
9863 
9864 	VATTR_INIT(&va);
9865 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9866 	if (xsecdst != NULL) {
9867 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9868 		va.va_vaflags |= VA_FILESEC_ACL;
9869 	}
9870 
9871 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9872 	    UIO_USERSPACE);
9873 	if (xsecdst != NULL) {
9874 		kauth_filesec_free(xsecdst);
9875 	}
9876 	return ciferror;
9877 }
9878 
9879 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9880 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9881 {
9882 	struct vnode_attr va;
9883 
9884 	VATTR_INIT(&va);
9885 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9886 
9887 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9888 	           UIO_USERSPACE);
9889 }
9890 
9891 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9892 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9893 {
9894 	struct vnode_attr va;
9895 
9896 	VATTR_INIT(&va);
9897 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9898 
9899 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9900 	           UIO_USERSPACE);
9901 }
9902 
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		/* DELETE-intent lookup; compound rmdir if the FS supports it. */
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Userland may not remove a swap-backing vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				/* Non-compound path: authorize here in VFS. */
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Raced with removal: re-drive the lookup (bounded). */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS will do lookup+authorize+rmdir itself. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the FS to fill attrs during the VNOP. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Capture the path now; it is gone after a successful rmdir. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent: break any directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued from saved state. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * NOTE(review): below, vp is used only as a sleep/wakeup channel
		 * address (never dereferenced) to pair racing restarters —
		 * presumably matching the AppleDouble restart handshake; confirm
		 * against rmdir_remove_orphaned_appleDouble().
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10184 
10185 /*
10186  * Remove a directory file.
10187  */
10188 /* ARGSUSED */
10189 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10190 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10191 {
10192 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10193 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10194 }
10195 
/*
 * Get direntry length padded to 8 byte alignment.  struct direntry embeds a
 * MAXPATHLEN-byte d_name; subtract the unused capacity beyond 'namlen'.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.  struct dirent embeds a
 * (__DARWIN_MAXNAMLEN + 1)-byte d_name; count only namelen + 1 (NUL) of it.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10207 
10208 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10209 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10210     int *numdirent, vfs_context_t ctxp)
10211 {
10212 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10213 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10214 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10215 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10216 	} else {
10217 		size_t bufsize;
10218 		void * bufptr;
10219 		uio_t auio;
10220 		struct direntry *entry64;
10221 		struct dirent *dep;
10222 		size_t bytesread;
10223 		int error;
10224 
10225 		/*
10226 		 * We're here because the underlying file system does not
10227 		 * support direnties or we mounted denying support so we must
10228 		 * fall back to dirents and convert them to direntries.
10229 		 *
10230 		 * Our kernel buffer needs to be smaller since re-packing will
10231 		 * expand each dirent.  The worse case (when the name length
10232 		 * is 3 or less) corresponds to a struct direntry size of 32
10233 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10234 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10235 		 * will prevent us from reading more than we can pack.
10236 		 *
10237 		 * Since this buffer is wired memory, we will limit the
10238 		 * buffer size to a maximum of 32K. We would really like to
10239 		 * use 32K in the MIN(), but we use magic number 87371 to
10240 		 * prevent uio_resid() * 3 / 8 from overflowing.
10241 		 */
10242 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10243 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10244 		if (bufptr == NULL) {
10245 			return ENOMEM;
10246 		}
10247 
10248 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10249 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10250 		auio->uio_offset = uio->uio_offset;
10251 
10252 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10253 
10254 		dep = (struct dirent *)bufptr;
10255 		bytesread = bufsize - uio_resid(auio);
10256 
10257 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10258 		/*
10259 		 * Convert all the entries and copy them out to user's buffer.
10260 		 */
10261 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10262 			/* First check that the dirent struct up to d_name is within the buffer */
10263 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10264 			    /* Check that the length of the entire dirent is within the buffer */
10265 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10266 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10267 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10268 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10269 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10270 				    vp->v_name ? vp->v_name : "<unknown>");
10271 				error = EIO;
10272 				break;
10273 			}
10274 
10275 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10276 
10277 			bzero(entry64, enbufsize);
10278 			/* Convert a dirent to a dirent64. */
10279 			entry64->d_ino = dep->d_ino;
10280 			entry64->d_seekoff = 0;
10281 			entry64->d_reclen = (uint16_t)enbufsize;
10282 			entry64->d_namlen = dep->d_namlen;
10283 			entry64->d_type = dep->d_type;
10284 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10285 
10286 			/* Move to next entry. */
10287 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10288 
10289 			/* Copy entry64 to user's buffer. */
10290 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10291 		}
10292 
10293 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10294 		if (error == 0) {
10295 			uio->uio_offset = auio->uio_offset;
10296 		}
10297 		uio_free(auio);
10298 		kfree_data(bufptr, bufsize);
10299 		kfree_type(struct direntry, entry64);
10300 		return error;
10301 	}
10302 }
10303 
/*
 * Upper bound on the user buffer size honored by getdirentries_common();
 * larger requests are silently clamped to this value.
 */
#define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10305 
10306 /*
10307  * Read a block of directory entries in a file system independent format.
10308  */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize file-offset updates on this fd.  If the fd's backing
	 * data changed between fp_getfvp() and taking the lock (e.g. a
	 * union-mount traversal below swapped in the lower vnode), drop
	 * everything and re-resolve the fd.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read from the fd's current offset into the caller's buffer. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this layer is a union mount: descend to the
	 * lower layer's directory, retarget the fd at it (taking a usecount
	 * ref), reset the offset, and read that directory from the start.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		/* Report the offset at which this read started. */
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10422 
10423 
10424 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10425 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10426 {
10427 	off_t offset;
10428 	ssize_t bytesread;
10429 	int error, eofflag;
10430 
10431 	AUDIT_ARG(fd, uap->fd);
10432 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10433 	    &bytesread, &offset, &eofflag, 0);
10434 
10435 	if (error == 0) {
10436 		if (proc_is64bit(p)) {
10437 			user64_long_t base = (user64_long_t)offset;
10438 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10439 		} else {
10440 			user32_long_t base = (user32_long_t)offset;
10441 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10442 		}
10443 		*retval = (int)bytesread;
10444 	}
10445 	return error;
10446 }
10447 
10448 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10449 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10450 {
10451 	off_t offset;
10452 	ssize_t bytesread;
10453 	int error, eofflag;
10454 	user_size_t bufsize;
10455 
10456 	AUDIT_ARG(fd, uap->fd);
10457 
10458 	/*
10459 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10460 	 * then the kernel carves out the last 4 bytes to return extended
10461 	 * information to userspace (namely whether we reached EOF with this call).
10462 	 */
10463 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10464 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10465 	} else {
10466 		bufsize = uap->bufsize;
10467 	}
10468 
10469 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10470 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10471 
10472 	if (error == 0) {
10473 		*retval = bytesread;
10474 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10475 
10476 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10477 			getdirentries64_flags_t flags = 0;
10478 			if (eofflag) {
10479 				flags |= GETDIRENTRIES64_EOF;
10480 			}
10481 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10482 			    sizeof(flags));
10483 		}
10484 	}
10485 	return error;
10486 }
10487 
10488 
10489 /*
10490  * Set the mode mask for creation of filesystem nodes.
10491  * XXX implement xsecurity
10492  */
10493 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10494 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10495 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10496 {
10497 	AUDIT_ARG(mask, newmask);
10498 	proc_fdlock(p);
10499 	*retval = p->p_fd.fd_cmask;
10500 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10501 	proc_fdunlock(p);
10502 	return 0;
10503 }
10504 
10505 /*
10506  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10507  *
10508  * Parameters:    p                       Process requesting to set the umask
10509  *                uap                     User argument descriptor (see below)
10510  *                retval                  umask of the process (parameter p)
10511  *
10512  * Indirect:      uap->newmask            umask to set
10513  *                uap->xsecurity          ACL to set
10514  *
10515  * Returns:        0                      Success
10516  *                !0                      Not success
10517  *
10518  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/* xsecurity is not implemented; umask1() ignores its fsec argument. */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10524 
/*
 * umask: set the file-mode creation mask, leaving any existing extended
 * security (ACL) information alone.  The previous mask is returned via
 * retval; always succeeds.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10530 
10531 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10532 	"com.apple.private.vfs.revoke-mounted-device"
10533 
10534 /*
10535  * Void all references to file by ripping underlying filesystem
10536  * away from vnode.
10537  */
10538 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a live mount on it cannot be yanked away. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* The caller must own the node or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually has the device in use. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10591 
10592 
10593 /*
10594  *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
10595  *  The following system calls are designed to support features
10596  *  which are specific to the HFS & HFS Plus volume formats
10597  */
10598 
10599 
10600 /*
10601  * Obtain attribute information on objects in a directory while enumerating
10602  * the directory.
10603  */
10604 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* remember the caller's count so it can be restored when descending a union layer */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the offset lock, then re-check that the fd still maps to the
	 * vnode we resolved; retry if a union traversal swapped it out.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	/* NOTE(review): error is always 0 here (checked above and never
	 * reassigned since), so this test appears vestigial — confirm
	 * before removing. */
	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10768 
10769 /*
10770  * Exchange data between two files
10771  */
10772 
10773 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up both paths; each successful lookup holds an iocount. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read+write rights on both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Resolve both paths up front only if somebody (fsevents or a
	 * kauth fileop listener) will actually consume them.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * Swap the cached names and parents so the name cache
		 * reflects the on-disk exchange of the two objects.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10924 
10925 /*
10926  * Return (in MB) the amount of freespace on the given vnode's volume.
10927  */
10928 uint32_t freespace_mb(vnode_t vp);
10929 
10930 uint32_t
freespace_mb(vnode_t vp)10931 freespace_mb(vnode_t vp)
10932 {
10933 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10934 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10935 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10936 }
10937 
10938 #if CONFIG_SEARCHFS
10939 
10940 /* ARGSUSED */
10941 
/*
 * searchfs: catalog-search system call.  Copies in the user's
 * fssearchblock (munging the 32-bit layout if necessary), validates the
 * search parameters, resolves the path to the volume root (descending
 * union-mount layers as recorded in the opaque search state), and
 * dispatches VNOP_SEARCHFS.  Matches are written directly into the
 * user's return buffer by the filesystem; the match count and search
 * state are copied out afterward.  EAGAIN from the filesystem means
 * "call again for more results".
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well  put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	/* NOTE(review): the allocation result is not NULL-checked before the
	 * copyin calls below — confirm kalloc_data(Z_WAITOK) cannot fail for
	 * this bounded size, or add an ENOMEM check. */
	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11224 
11225 #else /* CONFIG_SEARCHFS */
11226 
/*
 * searchfs() stub for kernels built without CONFIG_SEARCHFS.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11232 
11233 #endif /* CONFIG_SEARCHFS */
11234 
11235 
11236 #if CONFIG_DATALESS_FILES
11237 
11238 /*
11239  * === Namespace Resolver Up-call Mechanism ===
11240  *
11241  * When I/O is performed to a dataless file or directory (read, write,
11242  * lookup-in, etc.), the file system performs an upcall to the namespace
11243  * resolver (filecoordinationd) to materialize the object.
11244  *
11245  * We need multiple up-calls to be in flight at once, and we need these
11246  * up-calls to be interruptible, thus the following implementation:
11247  *
11248  * => The nspace_resolver_request represents the in-kernel request state.
11249  *    It contains a request ID, storage space for the errno code returned
11250  *    by filecoordinationd, and flags.
11251  *
11252  * => The request ID is simply a global monotonically incrementing 32-bit
11253  *    number.  Outstanding requests are stored in a hash table, and the
11254  *    hash function is extremely simple.
11255  *
11256  * => When an upcall is to be made to filecoordinationd, a request structure
11257  *    is allocated on the stack (it is small, and needs to live only during
11258  *    the duration of the call to resolve_nspace_item_ext()).  It is
11259  *    initialized and inserted into the table.  Some backpressure from
11260  *    filecoordinationd is applied by limiting the number of entries that
11261  *    can be inserted into the table (and thus limiting the number of
11262  *    outstanding requests issued to filecoordinationd); waiting for an
11263  *    available slot is interruptible.
11264  *
11265  * => Once the request has been inserted into the table, the up-call is made
11266  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11267  *    immediately and filecoordinationd processes the request asynchronously.
11268  *
11269  * => The caller now waits for the request to complete.  This is achieved by
11270  *    sleeping on the address of the request structure and waiting for
11271  *    filecoordinationd to mark the request structure as complete.  This
11272  *    is an interruptible sleep call; if interrupted, the request structure
11273  *    is removed from the table and EINTR is returned to the caller.  If
11274  *    this occurs, an advisory up-call is made to filecoordinationd with
11275  *    the request ID to indicate that the request can be aborted or
11276  *    de-prioritized at the discretion of filecoordinationd.
11277  *
11278  * => When filecoordinationd has completed the request, it signals completion
11279  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11280  *    decorated as a namespace resolver can write to this sysctl node.  The
11281  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11282  *    The request ID is looked up in the table, and if the request is found,
11283  *    the error code is stored in the request structure and a wakeup()
11284  *    issued on the address of the request structure.  If the request is not
11285  *    found, we simply drop the completion notification, assuming that the
11286  *    caller was interrupted.
11287  *
11288  * => When the waiting thread wakes up, it extracts the error code from the
11289  *    request structure, removes the request from the table, and returns the
11290  *    error code to the calling function.  Fini!
11291  */
11292 
/*
 * An in-flight materialization request.  Each instance lives on the
 * stack of the thread that issued the up-call and is linked into
 * nspace_resolver_request_hashtbl only for the duration of the request
 * (see the block comment above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t         r_vp;             /* vnode being materialized */
	uint32_t        r_req_id;         /* unique request ID; hash key */
	int             r_resolver_error; /* errno reported by the resolver */
	int             r_flags;          /* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001  /* resolver has completed this request */
11302 
/*
 * Return the next namespace-resolver request ID.  OSAddAtomic returns
 * the value before the increment, so IDs start at 0 and wrap at
 * UINT32_MAX; only uniqueness among concurrently-outstanding requests
 * matters (at most NSPACE_RESOLVER_MAX_OUTSTANDING are live at once).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;  /* static storage: implicitly zero */

	return OSAddAtomic(1, &next_req_id);
}
11310 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding requests, keyed by r_req_id. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;   /* mask from hashinit() */
static u_int nspace_resolver_request_count;       /* # of requests in table */
static bool nspace_resolver_request_wait_slot;    /* someone waiting for a slot? */
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the hash table, the count, and the wait flag above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11331 
11332 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11333 nspace_resolver_req_lookup(uint32_t req_id)
11334 {
11335 	struct nspace_resolver_requesthead *bucket;
11336 	struct nspace_resolver_request *req;
11337 
11338 	bucket = NSPACE_RESOLVER_HASH(req_id);
11339 	LIST_FOREACH(req, bucket, r_hashlink) {
11340 		if (req->r_req_id == req_id) {
11341 			return req;
11342 		}
11343 	}
11344 
11345 	return NULL;
11346 }
11347 
/*
 * Insert req into the request table, first waiting (interruptibly)
 * for a free slot when NSPACE_RESOLVER_MAX_OUTSTANDING requests are
 * already outstanding -- this applies backpressure on the load given
 * to filecoordinationd.  Returns 0 on success, or the msleep() error
 * (e.g. EINTR) if the wait was interrupted.
 *
 * Caller must hold NSPACE_REQ_LOCK(); msleep() drops and re-acquires
 * it while sleeping, hence the re-check in a loop.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	/* Request IDs must be unique among outstanding requests. */
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
11374 
11375 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)11376 nspace_resolver_req_remove(struct nspace_resolver_request *req)
11377 {
11378 	struct nspace_resolver_requesthead *bucket;
11379 
11380 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11381 #if DIAGNOSTIC
11382 	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
11383 #endif /* DIAGNOSTIC */
11384 	LIST_REMOVE(req, r_hashlink);
11385 	nspace_resolver_request_count--;
11386 
11387 	if (nspace_resolver_request_wait_slot) {
11388 		nspace_resolver_request_wait_slot = false;
11389 		wakeup(&nspace_resolver_request_count);
11390 	}
11391 }
11392 
/*
 * Send an advisory cancellation for req_id to filecoordinationd so it
 * can abort or de-prioritize the request (used when the waiter was
 * interrupted; see the block comment at the top of this section).
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	// Drop the send right acquired above.
	ipc_port_release_send(mp);
}
11415 
/*
 * Wait (interruptibly) for filecoordinationd to complete req.  On
 * return the request has been removed from the table, and the
 * resolver's error code -- or EINTR / ETIMEDOUT if the sleep was
 * interrupted -- is returned to the caller.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			// Interrupted: record the error ourselves and
			// advise the resolver it may abandon the request.
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	if (send_cancel_message) {
		// Advisory only; see nspace_resolver_req_cancel().
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11445 
/*
 * Record the resolver's result in req, mark it complete, and wake the
 * thread sleeping on the request in nspace_resolver_req_wait().
 * Caller must hold NSPACE_REQ_LOCK().
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
11455 
/*
 * Handle a completion notification for req_id (delivered by the
 * resolver through the vfs.nspace.complete sysctl).  If the request is
 * still in the table: when the resolver reported success and supplied
 * an orig_gencount, re-read the vnode's va_recursive_gencount under
 * the mount rename lock -- a mismatch means the directory changed
 * during materialization and is reported to the waiter as EBUSY.
 * Finally the request is marked complete, waking the waiter.  Unknown
 * request IDs are dropped silently (the waiter was likely interrupted).
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		// Hold the rename lock so the hierarchy can't shift under
		// the gencount comparison below.
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				// Couldn't read it; 0 disables the check below.
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11523 
11524 static struct proc *nspace_resolver_proc;
11525 
11526 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11527 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11528 {
11529 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11530 	    p == nspace_resolver_proc) ? 1 : 0;
11531 	return 0;
11532 }
11533 
/* Forward declaration; defined below under CONFIG_DATALESS_FILES. */
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Register (is_resolver != 0) or deregister the calling process as
 * the namespace resolver.  Requires root credentials plus the
 * dataless-resolver entitlement; only one resolver may be registered
 * at a time (EBUSY otherwise).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			// Another process is already the resolver.
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11575 
11576 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11577 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11578 {
11579 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11580 	    (p->p_vfs_iopolicy &
11581 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11582 		*is_prevented = 1;
11583 	} else {
11584 		*is_prevented = 0;
11585 	}
11586 	return 0;
11587 }
11588 
/*
 * Set whether dataless materialization is prevented for p by toggling
 * P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES in its iopolicy (atomic,
 * since iopolicy bits can be changed concurrently).  The registered
 * resolver is always prevented and may not opt in (EBUSY).
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11603 
11604 static int
nspace_materialization_get_thread_state(int * is_prevented)11605 nspace_materialization_get_thread_state(int *is_prevented)
11606 {
11607 	uthread_t ut = current_uthread();
11608 
11609 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11610 	return 0;
11611 }
11612 
11613 static int
nspace_materialization_set_thread_state(int is_prevented)11614 nspace_materialization_set_thread_state(int is_prevented)
11615 {
11616 	uthread_t ut = current_uthread();
11617 
11618 	if (is_prevented) {
11619 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11620 	} else {
11621 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11622 	}
11623 	return 0;
11624 }
11625 
11626 /* the vfs.nspace branch */
11627 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11628 
11629 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11630 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11631     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11632 {
11633 	struct proc *p = req->p;
11634 	int new_value, old_value, changed = 0;
11635 	int error;
11636 
11637 	error = nspace_resolver_get_proc_state(p, &old_value);
11638 	if (error) {
11639 		return error;
11640 	}
11641 
11642 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11643 	    &changed);
11644 	if (error == 0 && changed) {
11645 		error = nspace_resolver_set_proc_state(p, new_value);
11646 	}
11647 	return error;
11648 }
11649 
11650 /* decorate this process as the dataless file resolver */
11651 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11652     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11653     0, 0, sysctl_nspace_resolver, "I", "");
11654 
11655 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11656 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11657     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11658 {
11659 	struct proc *p = req->p;
11660 	int new_value, old_value, changed = 0;
11661 	int error;
11662 
11663 	error = nspace_materialization_get_proc_state(p, &old_value);
11664 	if (error) {
11665 		return error;
11666 	}
11667 
11668 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11669 	    &changed);
11670 	if (error == 0 && changed) {
11671 		error = nspace_materialization_set_proc_state(p, new_value);
11672 	}
11673 	return error;
11674 }
11675 
11676 /* decorate this process as not wanting to materialize dataless files */
11677 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11678     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11679     0, 0, sysctl_nspace_prevent_materialization, "I", "");
11680 
11681 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11682 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11683     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11684 {
11685 	int new_value, old_value, changed = 0;
11686 	int error;
11687 
11688 	error = nspace_materialization_get_thread_state(&old_value);
11689 	if (error) {
11690 		return error;
11691 	}
11692 
11693 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11694 	    &changed);
11695 	if (error == 0 && changed) {
11696 		error = nspace_materialization_set_thread_state(new_value);
11697 	}
11698 	return error;
11699 }
11700 
11701 /* decorate this thread as not wanting to materialize dataless files */
11702 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11703     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11704     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11705 
/*
 * Handler for vfs.nspace.complete: the resolver reports a finished
 * request here as an opaque { req_id, errno } pair of uint32_t's,
 * optionally followed by a uint64_t gencount.  Only the registered
 * resolver process may write.  The second sysctl_io_opaque() consumes
 * the bytes following the first pair; its failure is ignored because
 * the gencount is optional.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	// Only the decorated resolver process may report completions.
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11755 
11756 #endif /* CONFIG_DATALESS_FILES */
11757 
11758 #if CONFIG_DATALESS_FILES
11759 #define __no_dataless_unused    /* nothing */
11760 #else
11761 #define __no_dataless_unused    __unused
11762 #endif
11763 
/*
 * Decide whether dataless-file materialization is prevented for ctx.
 *
 * Returns:
 *   0            materialization may proceed
 *   EDEADLK      materialization is prevented (the default)
 *   EJUSTRETURN  caller is a dataless manipulator; proceed as if the
 *                object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11820 
/*
 * One-time initialization: allocate the namespace-resolver request
 * hash table.  A no-op when dataless files are not configured.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11830 
/*
 * Deregister p as the namespace resolver; used both from the explicit
 * sysctl path and when the resolver process exits.  If p is the
 * registered resolver, every outstanding request is failed with
 * ETIMEDOUT (so waiters don't hang forever) and the registration is
 * cleared.  Harmless for processes that are not the resolver.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		// Walk every hash bucket and complete each request.
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11856 
/*
 * Compatibility wrapper: materialize the dataless item at vp for
 * operation op, with no extra argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11862 
/* Entitlements gating the resolver and manipulator roles. */
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"

#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver, i.e. its task holds the dataless-resolver entitlement.
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11880 
11881 /*
11882  * Return TRUE if the vfs context is associated with a process entitled
11883  * for dataless manipulation.
11884  *
11885  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11886  * complication around CONFIG_DATALESS_FILES.
11887  */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	task_t task = vfs_context_task(ctx);
	// The resolver entitlement implies manipulation rights as well.
	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
#else
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
11899 
11900 #if CONFIG_DATALESS_FILES
11901 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11902 log_materialization_prevented(vnode_t vp, uint64_t op)
11903 {
11904 	char p_name[MAXCOMLEN + 1];
11905 	char *vntype;
11906 	proc_selfname(&p_name[0], sizeof(p_name));
11907 
11908 	if (vp->v_type == VREG) {
11909 		vntype = "File";
11910 	} else if (vp->v_type == VDIR) {
11911 		vntype = "Dir";
11912 	} else if (vp->v_type == VLNK) {
11913 		vntype = "SymLink";
11914 	} else {
11915 		vntype = "Other";
11916 	}
11917 
11918 #if DEVELOPMENT
11919 	char *path = NULL;
11920 	int   len;
11921 
11922 	path = get_pathbuff();
11923 	len = MAXPATHLEN;
11924 	if (path) {
11925 		vn_getpath(vp, path, &len);
11926 	}
11927 
11928 	os_log_debug(OS_LOG_DEFAULT,
11929 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11930 	    p_name, proc_selfpid(),
11931 	    op, vntype, path ? path : "<unknown-path>");
11932 	if (path) {
11933 		release_pathbuff(path);
11934 	}
11935 #else
11936 	os_log_debug(OS_LOG_DEFAULT,
11937 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11938 	    p_name, proc_selfpid(),
11939 	    op, vntype);
11940 #endif
11941 }
11942 #endif /* CONFIG_DATALESS_FILES */
11943 
11944 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11945 vfs_materialize_item(
11946 	struct vnode *vp __no_dataless_unused,
11947 	uint64_t op __no_dataless_unused,
11948 	int64_t offset __no_dataless_unused,
11949 	int64_t size __no_dataless_unused,
11950 	char *lookup_name __no_dataless_unused,
11951 	size_t const namelen __no_dataless_unused)
11952 {
11953 #if CONFIG_DATALESS_FILES
11954 	struct nspace_resolver_request req;
11955 	kern_return_t kern_ret;
11956 	mach_port_t mach_port;
11957 	char *path = NULL;
11958 	vfs_context_t context;
11959 	int path_len;
11960 	int error;
11961 	audit_token_t atoken;
11962 
11963 	/*
11964 	 * If this is a snapshot event and the vnode is on a disk image just
11965 	 * pretend nothing happened since any change to the disk image will
11966 	 * cause the disk image itself to get backed up and this avoids multi-
11967 	 * way deadlocks between the snapshot handler and the ever popular
11968 	 * diskimages-helper process. The variable nspace_allow_virtual_devs
11969 	 * allows this behavior to be overridden (for use by the Mobile
11970 	 * TimeMachine testing infrastructure which uses disk images).
11971 	 */
11972 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11973 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11974 		return ENOTSUP;
11975 	}
11976 
11977 	context = vfs_context_current();
11978 
11979 	error = vfs_context_dataless_materialization_is_prevented(context);
11980 	if (error) {
11981 		log_materialization_prevented(vp, op);
11982 		return error;
11983 	}
11984 
11985 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11986 	    &mach_port);
11987 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11988 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11989 		/*
11990 		 * Treat this like being unable to access the backing store
11991 		 * server.
11992 		 */
11993 		return ETIMEDOUT;
11994 	}
11995 
11996 	int path_alloc_len = MAXPATHLEN;
11997 	do {
11998 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
11999 		if (path == NULL) {
12000 			return ENOMEM;
12001 		}
12002 
12003 		path_len = path_alloc_len;
12004 		error = vn_getpath(vp, path, &path_len);
12005 		if (error == 0) {
12006 			break;
12007 		} else if (error == ENOSPC) {
12008 			kfree_data(path, path_alloc_len);
12009 			path = NULL;
12010 		} else {
12011 			goto out_release_port;
12012 		}
12013 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12014 
12015 	error = vfs_context_copy_audit_token(context, &atoken);
12016 	if (error) {
12017 		goto out_release_port;
12018 	}
12019 
12020 	req.r_req_id = next_nspace_req_id();
12021 	req.r_resolver_error = 0;
12022 	req.r_flags = 0;
12023 	req.r_vp = vp;
12024 
12025 	NSPACE_REQ_LOCK();
12026 	error = nspace_resolver_req_add(&req);
12027 	NSPACE_REQ_UNLOCK();
12028 	if (error) {
12029 		goto out_release_port;
12030 	}
12031 
12032 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12033 	if (vp->v_type == VDIR) {
12034 		char *tmpname = NULL;
12035 
12036 		/*
12037 		 * If the caller provided a lookup_name *and* a name length,
12038 		 * then we assume the lookup_name is not NUL-terminated.
12039 		 * Allocate a temporary buffer in this case to provide
12040 		 * a NUL-terminated path name to the IPC call.
12041 		 */
12042 		if (lookup_name != NULL && namelen != 0) {
12043 			if (namelen >= PATH_MAX) {
12044 				error = EINVAL;
12045 				goto out_release_port;
12046 			}
12047 			tmpname = zalloc(ZV_NAMEI);
12048 			strlcpy(tmpname, lookup_name, namelen + 1);
12049 			lookup_name = tmpname;
12050 		} else if (lookup_name != NULL) {
12051 			/*
12052 			 * If the caller provided a lookup_name with a
12053 			 * zero name length, then we assume it's NUL-
12054 			 * terminated.  Verify it has a valid length.
12055 			 */
12056 			if (strlen(lookup_name) >= PATH_MAX) {
12057 				error = EINVAL;
12058 				goto out_release_port;
12059 			}
12060 		}
12061 
12062 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12063 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12064 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12065 
12066 		if (tmpname != NULL) {
12067 			zfree(ZV_NAMEI, tmpname);
12068 
12069 			/*
12070 			 * Poison lookup_name rather than reference
12071 			 * freed memory.
12072 			 */
12073 			lookup_name = NULL;
12074 		}
12075 	} else {
12076 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12077 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12078 		    offset, size, path, atoken);
12079 	}
12080 	if (kern_ret != KERN_SUCCESS) {
12081 		/*
12082 		 * Also treat this like being unable to access the backing
12083 		 * store server.
12084 		 */
12085 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12086 		    kern_ret);
12087 		error = ETIMEDOUT;
12088 
12089 		NSPACE_REQ_LOCK();
12090 		nspace_resolver_req_remove(&req);
12091 		NSPACE_REQ_UNLOCK();
12092 		goto out_release_port;
12093 	}
12094 
12095 	/*
12096 	 * Give back the memory we allocated earlier while we wait; we
12097 	 * no longer need it.
12098 	 */
12099 	kfree_data(path, path_alloc_len);
12100 	path = NULL;
12101 
12102 	/*
12103 	 * Request has been submitted to the resolver. Now (interruptibly)
12104 	 * wait for completion. Upon requrn, the request will have been
12105 	 * removed from the lookup table.
12106 	 */
12107 	error = nspace_resolver_req_wait(&req);
12108 
12109 out_release_port:
12110 	if (path != NULL) {
12111 		kfree_data(path, path_alloc_len);
12112 		path = NULL;
12113 	}
12114 	ipc_port_release_send(mach_port);
12115 
12116 	return error;
12117 #else
12118 	return ENOTSUP;
12119 #endif /* CONFIG_DATALESS_FILES */
12120 }
12121 
12122 /*
12123  * vfs_materialize_file: Materialize a regular file.
12124  *
12125  * Inputs:
12126  * vp		The dataless file to be materialized.
12127  *
12128  * op		What kind of operation is being performed:
12129  *		-> NAMESPACE_HANDLER_READ_OP
12130  *		-> NAMESPACE_HANDLER_WRITE_OP
12131  *		-> NAMESPACE_HANDLER_LINK_CREATE
12132  *		-> NAMESPACE_HANDLER_DELETE_OP
12133  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12134  *		-> NAMESPACE_HANDLER_RENAME_OP
12135  *
12136  * offset	offset of I/O for READ or WRITE.  Ignored for
12137  *		other ops.
12138  *
12139  * size		size of I/O for READ or WRITE  Ignored for
12140  *		other ops.
12141  *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
12143  * consider the range to be unknown.
12144  *
12145  * Upon successful return, the caller may proceed with the operation.
12146  * N.B. the file may still be "dataless" in this case.
12147  */
12148 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12149 vfs_materialize_file(
12150 	struct vnode *vp,
12151 	uint64_t op,
12152 	int64_t offset,
12153 	int64_t size)
12154 {
12155 	if (vp->v_type != VREG) {
12156 		return EFTYPE;
12157 	}
12158 	return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12159 }
12160 
12161 /*
12162  * vfs_materialize_dir:
12163  *
12164  * Inputs:
12165  * vp		The dataless directory to be materialized.
12166  *
12167  * op		What kind of operation is being performed:
12168  *		-> NAMESPACE_HANDLER_READ_OP
12169  *		-> NAMESPACE_HANDLER_WRITE_OP
12170  *		-> NAMESPACE_HANDLER_DELETE_OP
12171  *		-> NAMESPACE_HANDLER_RENAME_OP
12172  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12173  *
12174  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12175  *		other ops.  May or may not be NUL-terminated; see below.
12176  *
12177  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12178  *		terminated and namelen is the number of valid bytes in
12179  *		lookup_name. If zero, then lookup_name is assumed to be
12180  *		NUL-terminated.
12181  *
12182  * Upon successful return, the caller may proceed with the operation.
12183  * N.B. the directory may still be "dataless" in this case.
12184  */
12185 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12186 vfs_materialize_dir(
12187 	struct vnode *vp,
12188 	uint64_t op,
12189 	char *lookup_name,
12190 	size_t namelen)
12191 {
12192 	if (vp->v_type != VDIR) {
12193 		return EFTYPE;
12194 	}
12195 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12196 		return EINVAL;
12197 	}
12198 	return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12199 }
12200 
12201 int
resolve_nspace_item_ext(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,void * arg __unused)12202 resolve_nspace_item_ext(
12203 	struct vnode *vp __no_dataless_unused,
12204 	uint64_t op __no_dataless_unused,
12205 	void *arg __unused)
12206 {
12207 #if CONFIG_DATALESS_FILES
12208 	int error;
12209 	mach_port_t mp;
12210 	char *path = NULL;
12211 	int path_len;
12212 	kern_return_t kr;
12213 	struct nspace_resolver_request req;
12214 
12215 	// only allow namespace events on regular files, directories and symlinks.
12216 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
12217 		return EFTYPE;
12218 	}
12219 
12220 	//
12221 	// if this is a snapshot event and the vnode is on a
12222 	// disk image just pretend nothing happened since any
12223 	// change to the disk image will cause the disk image
12224 	// itself to get backed up and this avoids multi-way
12225 	// deadlocks between the snapshot handler and the ever
12226 	// popular diskimages-helper process.  the variable
12227 	// nspace_allow_virtual_devs allows this behavior to
12228 	// be overridden (for use by the Mobile TimeMachine
12229 	// testing infrastructure which uses disk images)
12230 	//
12231 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12232 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12233 		return ENOTSUP;
12234 	}
12235 
12236 	error = vfs_context_dataless_materialization_is_prevented(
12237 		vfs_context_current());
12238 	if (error) {
12239 		log_materialization_prevented(vp, op);
12240 		return error;
12241 	}
12242 
12243 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
12244 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
12245 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12246 		// Treat this like being unable to access the backing
12247 		// store server.
12248 		return ETIMEDOUT;
12249 	}
12250 
12251 	int path_alloc_len = MAXPATHLEN;
12252 	do {
12253 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12254 		if (path == NULL) {
12255 			return ENOMEM;
12256 		}
12257 
12258 		path_len = path_alloc_len;
12259 		error = vn_getpath(vp, path, &path_len);
12260 		if (error == 0) {
12261 			break;
12262 		} else if (error == ENOSPC) {
12263 			kfree_data(path, path_alloc_len);
12264 			path = NULL;
12265 		} else {
12266 			goto out_release_port;
12267 		}
12268 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12269 
12270 	if (error == 0) {
12271 		int xxx_rdar44371223;   /* XXX Mig bug */
12272 		req.r_req_id = next_nspace_req_id();
12273 		req.r_resolver_error = 0;
12274 		req.r_flags = 0;
12275 
12276 		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
12277 			req.r_vp = vp;
12278 		} else {
12279 			goto out_release_port;
12280 		}
12281 
12282 		NSPACE_REQ_LOCK();
12283 		error = nspace_resolver_req_add(&req);
12284 		NSPACE_REQ_UNLOCK();
12285 		if (error) {
12286 			vnode_rele(req.r_vp);
12287 			goto out_release_port;
12288 		}
12289 
12290 		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12291 		kr = send_nspace_resolve_path(mp, req.r_req_id,
12292 		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
12293 		    path, &xxx_rdar44371223);
12294 		if (kr != KERN_SUCCESS) {
12295 			// Also treat this like being unable to access
12296 			// the backing store server.
12297 			os_log_error(OS_LOG_DEFAULT,
12298 			    "NSPACE resolve_path failure: %d", kr);
12299 			error = ETIMEDOUT;
12300 
12301 			NSPACE_REQ_LOCK();
12302 			nspace_resolver_req_remove(&req);
12303 			NSPACE_REQ_UNLOCK();
12304 			vnode_rele(req.r_vp);
12305 			goto out_release_port;
12306 		}
12307 
12308 		// Give back the memory we allocated earlier while
12309 		// we wait; we no longer need it.
12310 		kfree_data(path, path_alloc_len);
12311 		path = NULL;
12312 
12313 		// Request has been submitted to the resolver.
12314 		// Now (interruptibly) wait for completion.
12315 		// Upon requrn, the request will have been removed
12316 		// from the lookup table.
12317 		error = nspace_resolver_req_wait(&req);
12318 
12319 		vnode_rele(req.r_vp);
12320 	}
12321 
12322 out_release_port:
12323 	if (path != NULL) {
12324 		kfree_data(path, path_alloc_len);
12325 		path = NULL;
12326 	}
12327 	ipc_port_release_send(mp);
12328 
12329 	return error;
12330 #else
12331 	return ENOTSUP;
12332 #endif /* CONFIG_DATALESS_FILES */
12333 }
12334 
/*
 * nspace_snapshot_event -- legacy namespace snapshot-event hook.
 *
 * Snapshot events are no longer handled through this path; this stub
 * ignores all of its arguments and always reports success.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
12341 
#if 0
/*
 * Disabled (not compiled): build a "/.vol/<fsid>/<fileid>" style volfs
 * path for a vnode.  On getattr failure a placeholder path is emitted
 * and -1 is returned; otherwise 0.  Kept for reference only.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12364 
12365 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12366 fsctl_bogus_command_compat(unsigned long cmd)
12367 {
12368 	switch (cmd) {
12369 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12370 		return FSIOC_SYNC_VOLUME;
12371 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12372 		return FSIOC_ROUTEFS_SETROUTEID;
12373 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12374 		return FSIOC_SET_PACKAGE_EXTS;
12375 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12376 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12377 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12378 		return DISK_CONDITIONER_IOC_GET;
12379 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12380 		return DISK_CONDITIONER_IOC_SET;
12381 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12382 		return FSIOC_FIOSEEKHOLE;
12383 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12384 		return FSIOC_FIOSEEKDATA;
12385 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12386 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12387 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12388 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12389 	}
12390 
12391 	return cmd;
12392 }
12393 
/*
 * Compare-and-swap a vnode's BSD flags via the filesystem's
 * FSIOC_CAS_BSDFLAGS ioctl.  Used as the setattr callback passed to
 * chflags0() from handle_flags(); arg points at the caller's
 * struct fsioc_cas_bsdflags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12399 
/*
 * FSIOC_SYNC_VOLUME handler: sync the mount containing vp.
 *
 * The iocount on vp is dropped early (after taking a mount iteration
 * ref) to avoid deadlocking against vnode_iterate_prepare(); on return
 * *arg_vp is set to NULL so the caller knows the vnode was released.
 * *data holds the caller-supplied FSCTL_SYNC_* flags.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* hold keeps the vnode memory valid after the iocount is dropped */
	vnode_hold(vp);
	vnode_put(vp);

	/* translate FSCTL_SYNC_WAIT into the VFS_SYNC waitfor argument */
	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (For eg APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* waitfor flags accumulated in
	 * `arg`, not the caller-supplied FSCTL_* flags in *data -- confirm
	 * this is the intended trigger for F_FULLFSYNC.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12464 
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID handler: copy the route path in from user
 * space and kernel-mount routefs there.  Superuser only.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t copied = 0;
	int err;

	err = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (err != 0) {
		return err;
	}

	bzero(routepath, MAXPATHLEN);
	err = copyinstr(udata, &routepath[0], MAXPATHLEN, &copied);
	if (err != 0) {
		return err;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12485 
12486 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12487 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12488 {
12489 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12490 	struct vnode_attr va;
12491 	int error;
12492 
12493 	VATTR_INIT(&va);
12494 	VATTR_SET(&va, va_flags, cas->new_flags);
12495 
12496 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12497 
12498 #if CONFIG_FSE
12499 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12500 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12501 	}
12502 #endif
12503 
12504 	return error;
12505 }
12506 
12507 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12508 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12509 {
12510 	struct mount *mp = NULL;
12511 	errno_t rootauth = 0;
12512 
12513 	mp = vp->v_mount;
12514 
12515 	/*
12516 	 * query the underlying FS and see if it reports something
12517 	 * sane for this vnode. If volume is authenticated via
12518 	 * chunklist, leave that for the caller to determine.
12519 	 */
12520 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12521 
12522 	return rootauth;
12523 }
12524 
12525 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12526 	"com.apple.private.kernel.set-package-extensions"
12527 
12528 /*
12529  * Make a filesystem-specific control call:
12530  */
12531 /* ARGSUSED */
12532 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)12533 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
12534 {
12535 	int error = 0;
12536 	boolean_t is64bit;
12537 	u_int size;
12538 #define STK_PARAMS 128
12539 	char stkbuf[STK_PARAMS] = {0};
12540 	caddr_t data, memp;
12541 	vnode_t vp = *arg_vp;
12542 
12543 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
12544 		return ENOTTY;
12545 	}
12546 
12547 	cmd = fsctl_bogus_command_compat(cmd);
12548 
12549 	size = IOCPARM_LEN(cmd);
12550 	if (size > IOCPARM_MAX) {
12551 		return EINVAL;
12552 	}
12553 
12554 	is64bit = proc_is64bit(p);
12555 
12556 	memp = NULL;
12557 
12558 	if (size > sizeof(stkbuf)) {
12559 		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
12560 			return ENOMEM;
12561 		}
12562 		data = memp;
12563 	} else {
12564 		data = &stkbuf[0];
12565 	};
12566 
12567 	if (cmd & IOC_IN) {
12568 		if (size) {
12569 			error = copyin(udata, data, size);
12570 			if (error) {
12571 				if (memp) {
12572 					kfree_data(memp, size);
12573 				}
12574 				return error;
12575 			}
12576 		} else {
12577 			if (is64bit) {
12578 				*(user_addr_t *)data = udata;
12579 			} else {
12580 				*(uint32_t *)data = (uint32_t)udata;
12581 			}
12582 		};
12583 	} else if ((cmd & IOC_OUT) && size) {
12584 		/*
12585 		 * Zero the buffer so the user always
12586 		 * gets back something deterministic.
12587 		 */
12588 		bzero(data, size);
12589 	} else if (cmd & IOC_VOID) {
12590 		if (is64bit) {
12591 			*(user_addr_t *)data = udata;
12592 		} else {
12593 			*(uint32_t *)data = (uint32_t)udata;
12594 		}
12595 	}
12596 
12597 	/* Check to see if it's a generic command */
12598 	switch (cmd) {
12599 	case FSIOC_SYNC_VOLUME:
12600 		error = handle_sync_volume(vp, arg_vp, data, ctx);
12601 		break;
12602 
12603 	case FSIOC_ROUTEFS_SETROUTEID:
12604 #if ROUTEFS
12605 		error = handle_routes(udata);
12606 #endif
12607 		break;
12608 
12609 	case FSIOC_SET_PACKAGE_EXTS: {
12610 		user_addr_t ext_strings;
12611 		uint32_t    num_entries;
12612 		uint32_t    max_width;
12613 
12614 		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
12615 		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
12616 			error = EPERM;
12617 			break;
12618 		}
12619 
12620 		if ((is64bit && size != sizeof(user64_package_ext_info))
12621 		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
12622 			// either you're 64-bit and passed a 64-bit struct or
12623 			// you're 32-bit and passed a 32-bit struct.  otherwise
12624 			// it's not ok.
12625 			error = EINVAL;
12626 			break;
12627 		}
12628 
12629 		if (is64bit) {
12630 			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
12631 				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
12632 			}
12633 			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
12634 			num_entries = ((user64_package_ext_info *)data)->num_entries;
12635 			max_width   = ((user64_package_ext_info *)data)->max_width;
12636 		} else {
12637 			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
12638 			num_entries = ((user32_package_ext_info *)data)->num_entries;
12639 			max_width   = ((user32_package_ext_info *)data)->max_width;
12640 		}
12641 		error = set_package_extensions_table(ext_strings, num_entries, max_width);
12642 	}
12643 	break;
12644 
12645 	case FSIOC_SET_FSTYPENAME_OVERRIDE:
12646 	{
12647 		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12648 			break;
12649 		}
12650 		if (vp->v_mount) {
12651 			mount_lock(vp->v_mount);
12652 			if (data[0] != 0) {
12653 				int i;
12654 				for (i = 0; i < MFSTYPENAMELEN; i++) {
12655 					if (!data[i]) {
12656 						goto continue_copy;
12657 					}
12658 				}
12659 				/*
12660 				 * Getting here means we have a user data string which has no
12661 				 * NULL termination in its first MFSTYPENAMELEN bytes.
12662 				 * This is bogus, let's avoid strlcpy-ing the read data and
12663 				 * return an error.
12664 				 */
12665 				error = EINVAL;
12666 				goto unlock;
12667 continue_copy:
12668 				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
12669 				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
12670 				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
12671 					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
12672 					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
12673 				}
12674 			} else {
12675 				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
12676 					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
12677 				}
12678 				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
12679 				vp->v_mount->fstypename_override[0] = '\0';
12680 			}
12681 unlock:
12682 			mount_unlock(vp->v_mount);
12683 		}
12684 	}
12685 	break;
12686 
12687 	case DISK_CONDITIONER_IOC_GET: {
12688 		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12689 	}
12690 	break;
12691 
12692 	case DISK_CONDITIONER_IOC_SET: {
12693 		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
12694 	}
12695 	break;
12696 
12697 	case FSIOC_CAS_BSDFLAGS:
12698 		error = handle_flags(vp, data, ctx);
12699 		break;
12700 
12701 	case FSIOC_FD_ONLY_OPEN_ONCE: {
12702 		error = 0;
12703 		if (vnode_usecount(vp) > 1) {
12704 			vnode_lock_spin(vp);
12705 			if (vp->v_lflag & VL_HASSTREAMS) {
12706 				if (vnode_isinuse_locked(vp, 1, 1)) {
12707 					error = EBUSY;
12708 				}
12709 			} else if (vnode_usecount(vp) > 1) {
12710 				error = EBUSY;
12711 			}
12712 			vnode_unlock(vp);
12713 		}
12714 	}
12715 	break;
12716 
12717 	case FSIOC_EVAL_ROOTAUTH:
12718 		error = handle_auth(vp, cmd, data, options, ctx);
12719 		break;
12720 
12721 	case FSIOC_TEST_FSE_ACCESS_GRANTED:
12722 		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
12723 		break;
12724 
12725 	default: {
12726 		/* other, known commands shouldn't be passed down here */
12727 		switch (cmd) {
12728 		case F_PUNCHHOLE:
12729 		case F_TRIM_ACTIVE_FILE:
12730 		case F_RDADVISE:
12731 		case F_TRANSCODEKEY:
12732 		case F_GETPROTECTIONLEVEL:
12733 		case F_GETDEFAULTPROTLEVEL:
12734 		case F_MAKECOMPRESSED:
12735 		case F_SET_GREEDY_MODE:
12736 		case F_SETSTATICCONTENT:
12737 		case F_SETIOTYPE:
12738 		case F_SETBACKINGSTORE:
12739 		case F_GETPATH_MTMINFO:
12740 		case APFSIOC_REVERT_TO_SNAPSHOT:
12741 		case FSIOC_FIOSEEKHOLE:
12742 		case FSIOC_FIOSEEKDATA:
12743 		case HFS_GET_BOOT_INFO:
12744 		case HFS_SET_BOOT_INFO:
12745 		case FIOPINSWAP:
12746 		case F_CHKCLEAN:
12747 		case F_FULLFSYNC:
12748 		case F_BARRIERFSYNC:
12749 		case F_FREEZE_FS:
12750 		case F_THAW_FS:
12751 		case FSIOC_KERNEL_ROOTAUTH:
12752 		case FSIOC_GRAFT_FS:
12753 		case FSIOC_UNGRAFT_FS:
12754 		case FSIOC_AUTH_FS:
12755 			error = EINVAL;
12756 			goto outdrop;
12757 		}
12758 		/* Invoke the filesystem-specific code */
12759 		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12760 	}
12761 	} /* end switch stmt */
12762 
12763 	/*
12764 	 * if no errors, copy any data to user. Size was
12765 	 * already set and checked above.
12766 	 */
12767 	if (error == 0 && (cmd & IOC_OUT) && size) {
12768 		error = copyout(data, udata, size);
12769 	}
12770 
12771 outdrop:
12772 	if (memp) {
12773 		kfree_data(memp, size);
12774 	}
12775 
12776 	return error;
12777 }
12778 
/* ARGSUSED */
/*
 * fsctl() system call: path-based filesystem control operation.
 *
 * Resolves uap->path (following symlinks unless FSOPT_NOFOLLOW is set),
 * runs the MACF fsctl check, and hands off to fsctl_internal().
 * FSIOC_FD_ONLY_OPEN_ONCE is rejected here because it is only
 * meaningful on an already-open file descriptor (see ffsctl).
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* operate on the firmlink itself, bypassing the name cache */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may have released the vnode and set vp to NULL */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/* ARGSUSED */
/*
 * ffsctl() system call: fd-based variant of fsctl().
 *
 * Looks up the vnode behind uap->fd, takes an iocount, runs the MACF
 * fsctl check, and hands off to fsctl_internal().  The fd reference is
 * dropped on every path.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on:  */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
12873 /* end of fsctl system call */
12874 
12875 #define FILESEC_ACCESS_ENTITLEMENT              \
12876 	"com.apple.private.vfs.filesec-access"
12877 
12878 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12879 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12880 {
12881 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12882 		/*
12883 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12884 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12885 		 */
12886 		if ((!setting && vfs_context_issuser(ctx)) ||
12887 		    IOTaskHasEntitlement(vfs_context_task(ctx),
12888 		    FILESEC_ACCESS_ENTITLEMENT)) {
12889 			return 0;
12890 		}
12891 	}
12892 
12893 	return EPERM;
12894 }
12895 
/*
 * getxattr() system call: retrieve the data of an extended attribute
 * by path.
 *
 * Follows symlinks unless XATTR_NOFOLLOW is set; XATTR_NOSECURITY and
 * XATTR_NODEFAULT are kernel-internal and rejected here.  Protected
 * attribute names require the filesec entitlement.  On success *retval
 * is the number of bytes read (or, when no buffer was supplied, the
 * attribute's size).
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* auio == NULL asks vn_getxattr for just the attribute's size */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12981 
/*
 * fgetxattr() system call: retrieve the data of an extended attribute
 * by file descriptor.
 *
 * XATTR_NOFOLLOW is meaningless for an fd and rejected along with the
 * kernel-internal XATTR_NOSECURITY / XATTR_NODEFAULT options.  On
 * success *retval is the number of bytes read (or, when no buffer was
 * supplied, the attribute's size).
 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	if (uap->value && uap->size > 0) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	/* auio == NULL asks vn_getxattr for just the attribute's size */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13039 
/*
 * Heap-allocated scratch state for setxattr(): the nameidata, the
 * attribute name, and the uio buffer together are too large to keep on
 * the kernel stack.  (Previous comment claiming "checkdirs iteration"
 * was a copy-paste error.)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
13046 
/*
 * setxattr() system call: set the data of an extended attribute by
 * path.
 *
 * Follows symlinks unless XATTR_NOFOLLOW is set.  Setting a protected
 * attribute name requires the filesec entitlement.  Scratch state is
 * heap-allocated (struct setxattr_ctx) to keep the stack small.  With
 * CONFIG_FILE_LEASES, any lease on the parent directory is broken
 * before the write.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* a non-zero size requires a value buffer to read from */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* need the parent vnode to break its directory lease below */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13126 
/*
 * fsetxattr() system call: set the data of an extended attribute by
 * file descriptor.
 *
 * XATTR_NOFOLLOW is meaningless for an fd and rejected along with the
 * kernel-internal XATTR_NOSECURITY / XATTR_NODEFAULT options.  Setting
 * a protected attribute name requires the filesec entitlement.  With
 * CONFIG_FILE_LEASES, any lease covering the vnode's directory is
 * broken before the write.
 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	/* a non-zero size requires a value buffer to read from */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13194 
/*
 * removexattr() system call: remove an extended attribute by path.
 *
 * Follows symlinks unless XATTR_NOFOLLOW is set.  Protected attribute
 * names can never be removed (EPERM, no entitlement escape hatch).
 * With CONFIG_FILE_LEASES, any lease on the parent directory is broken
 * before the removal.
 * XXX Code duplication with setxattr/fsetxattr here.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* need the parent vnode to break its directory lease below */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13249 
13250 /*
13251  * Remove an extended attribute.
13252  * XXX Code duplication here.
13253  */
13254 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13255 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13256 {
13257 	vnode_t vp;
13258 	char attrname[XATTR_MAXNAMELEN + 1];
13259 	size_t namelen;
13260 	int error;
13261 #if CONFIG_FSE
13262 	vfs_context_t ctx = vfs_context_current();
13263 #endif
13264 
13265 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13266 		return EINVAL;
13267 	}
13268 
13269 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13270 	if (error != 0) {
13271 		return error;
13272 	}
13273 	if (xattr_protected(attrname)) {
13274 		return EPERM;
13275 	}
13276 	if ((error = file_vnode(uap->fd, &vp))) {
13277 		return error;
13278 	}
13279 	if ((error = vnode_getwithref(vp))) {
13280 		file_drop(uap->fd);
13281 		return error;
13282 	}
13283 
13284 #if CONFIG_FILE_LEASES
13285 	vnode_breakdirlease(vp, true, O_WRONLY);
13286 #endif
13287 
13288 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13289 #if CONFIG_FSE
13290 	if (error == 0) {
13291 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13292 		    FSE_ARG_VNODE, vp,
13293 		    FSE_ARG_DONE);
13294 	}
13295 #endif
13296 	vnode_put(vp);
13297 	file_drop(uap->fd);
13298 	*retval = 0;
13299 	return error;
13300 }
13301 
13302 /*
13303  * Retrieve the list of extended attribute names.
13304  * XXX Code duplication here.
13305  */
13306 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13307 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13308 {
13309 	vnode_t vp;
13310 	struct nameidata nd;
13311 	vfs_context_t ctx = vfs_context_current();
13312 	uio_t auio = NULL;
13313 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13314 	size_t attrsize = 0;
13315 	u_int32_t nameiflags;
13316 	int error;
13317 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13318 
13319 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13320 		return EINVAL;
13321 	}
13322 
13323 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13324 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13325 	if ((error = namei(&nd))) {
13326 		return error;
13327 	}
13328 	vp = nd.ni_vp;
13329 	nameidone(&nd);
13330 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13331 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13332 		    &uio_buf[0], sizeof(uio_buf));
13333 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13334 	}
13335 
13336 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13337 
13338 	vnode_put(vp);
13339 	if (auio) {
13340 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13341 	} else {
13342 		*retval = (user_ssize_t)attrsize;
13343 	}
13344 	return error;
13345 }
13346 
13347 /*
13348  * Retrieve the list of extended attribute names.
13349  * XXX Code duplication here.
13350  */
13351 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13352 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13353 {
13354 	vnode_t vp;
13355 	uio_t auio = NULL;
13356 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13357 	size_t attrsize = 0;
13358 	int error;
13359 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13360 
13361 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13362 		return EINVAL;
13363 	}
13364 
13365 	if ((error = file_vnode(uap->fd, &vp))) {
13366 		return error;
13367 	}
13368 	if ((error = vnode_getwithref(vp))) {
13369 		file_drop(uap->fd);
13370 		return error;
13371 	}
13372 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13373 		auio = uio_createwithbuffer(1, 0, spacetype,
13374 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13375 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13376 	}
13377 
13378 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13379 
13380 	vnode_put(vp);
13381 	file_drop(uap->fd);
13382 	if (auio) {
13383 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13384 	} else {
13385 		*retval = (user_ssize_t)attrsize;
13386 	}
13387 	return error;
13388 }
13389 
13390 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13391 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13392     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13393 {
13394 	int error;
13395 	struct mount *mp = NULL;
13396 	vnode_t vp;
13397 	int length;
13398 	int bpflags;
13399 	/* maximum number of times to retry build_path */
13400 	unsigned int retries = 0x10;
13401 
13402 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13403 		return EINVAL;
13404 	}
13405 
13406 	if (buf == NULL) {
13407 		return ENOMEM;
13408 	}
13409 
13410 retry:
13411 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13412 		error = ENOTSUP;  /* unexpected failure */
13413 		return ENOTSUP;
13414 	}
13415 
13416 #if CONFIG_UNION_MOUNTS
13417 unionget:
13418 #endif /* CONFIG_UNION_MOUNTS */
13419 	if (objid == 2) {
13420 		struct vfs_attr vfsattr;
13421 		int use_vfs_root = TRUE;
13422 
13423 		VFSATTR_INIT(&vfsattr);
13424 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13425 		if (!(options & FSOPT_ISREALFSID) &&
13426 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13427 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13428 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13429 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13430 				use_vfs_root = FALSE;
13431 			}
13432 		}
13433 
13434 		if (use_vfs_root) {
13435 			error = VFS_ROOT(mp, &vp, ctx);
13436 		} else {
13437 			error = VFS_VGET(mp, objid, &vp, ctx);
13438 		}
13439 	} else {
13440 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13441 	}
13442 
13443 #if CONFIG_UNION_MOUNTS
13444 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13445 		/*
13446 		 * If the fileid isn't found and we're in a union
13447 		 * mount volume, then see if the fileid is in the
13448 		 * mounted-on volume.
13449 		 */
13450 		struct mount *tmp = mp;
13451 		mp = vnode_mount(tmp->mnt_vnodecovered);
13452 		vfs_unbusy(tmp);
13453 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13454 			goto unionget;
13455 		}
13456 	} else {
13457 		vfs_unbusy(mp);
13458 	}
13459 #else
13460 	vfs_unbusy(mp);
13461 #endif /* CONFIG_UNION_MOUNTS */
13462 
13463 	if (error) {
13464 		return error;
13465 	}
13466 
13467 #if CONFIG_MACF
13468 	error = mac_vnode_check_fsgetpath(ctx, vp);
13469 	if (error) {
13470 		vnode_put(vp);
13471 		return error;
13472 	}
13473 #endif
13474 
13475 	/* Obtain the absolute path to this vnode. */
13476 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13477 	if (options & FSOPT_NOFIRMLINKPATH) {
13478 		bpflags |= BUILDPATH_NO_FIRMLINK;
13479 	}
13480 	bpflags |= BUILDPATH_CHECK_MOVED;
13481 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13482 	vnode_put(vp);
13483 
13484 	if (error) {
13485 		/* there was a race building the path, try a few more times */
13486 		if (error == EAGAIN) {
13487 			--retries;
13488 			if (retries > 0) {
13489 				goto retry;
13490 			}
13491 
13492 			error = ENOENT;
13493 		}
13494 		goto out;
13495 	}
13496 
13497 	AUDIT_ARG(text, buf);
13498 
13499 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13500 		unsigned long path_words[NUMPARMS];
13501 		size_t path_len = sizeof(path_words);
13502 
13503 		if ((size_t)length < path_len) {
13504 			memcpy((char *)path_words, buf, length);
13505 			memset((char *)path_words + length, 0, path_len - length);
13506 
13507 			path_len = length;
13508 		} else {
13509 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13510 		}
13511 
13512 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13513 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13514 	}
13515 
13516 	*pathlen = length; /* may be superseded by error */
13517 
13518 out:
13519 	return error;
13520 }
13521 
13522 /*
13523  * Obtain the full pathname of a file system object by id.
13524  */
13525 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13526 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13527     uint32_t options, user_ssize_t *retval)
13528 {
13529 	vfs_context_t ctx = vfs_context_current();
13530 	fsid_t fsid;
13531 	char *realpath;
13532 	int length;
13533 	int error;
13534 
13535 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13536 		return EINVAL;
13537 	}
13538 
13539 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13540 		return error;
13541 	}
13542 	AUDIT_ARG(value32, fsid.val[0]);
13543 	AUDIT_ARG(value64, objid);
13544 	/* Restrict output buffer size for now. */
13545 
13546 	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13547 		return EINVAL;
13548 	}
13549 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13550 	if (realpath == NULL) {
13551 		return ENOMEM;
13552 	}
13553 
13554 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13555 	    options, &length);
13556 
13557 	if (error) {
13558 		goto out;
13559 	}
13560 
13561 	error = copyout((caddr_t)realpath, buf, length);
13562 
13563 	*retval = (user_ssize_t)length; /* may be superseded by error */
13564 out:
13565 	kfree_data(realpath, bufsize);
13566 	return error;
13567 }
13568 
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	/* Legacy entry point: identical to fsgetpath_ext(2) with no options. */
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
13575 
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	/* Extended entry point: forwards the caller-supplied option flags. */
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
13582 
13583 /*
13584  * Common routine to handle various flavors of statfs data heading out
13585  *	to user space.
13586  *
13587  * Returns:	0			Success
13588  *		EFAULT
13589  */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* A mount may present a different fs type name than the real one. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Older callers' struct ends before the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* A mount may present a different fs type name than the real one. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Older callers' struct ends before the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full (untrimmed) structure size to the caller. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13711 
13712 /*
13713  * copy stat structure into user_stat structure.
13714  */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/*
	 * Field-by-field marshal of a kernel struct stat into the 64-bit
	 * user-process layout; destination is zeroed first so padding and
	 * any unset fields never leak kernel stack contents.
	 */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13751 
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/*
	 * Field-by-field marshal of a kernel struct stat into the 32-bit
	 * user-process layout.  Timestamps are narrowed with explicit
	 * casts (a post-2038 tv_sec will truncate for 32-bit callers).
	 */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13788 
13789 /*
13790  * copy stat64 structure into user_stat64 structure.
13791  */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/*
	 * Field-by-field marshal of a kernel struct stat64 into the 64-bit
	 * user-process layout; destination is zeroed first so padding never
	 * leaks kernel stack contents.  Unlike the plain stat variant this
	 * also carries the birthtime.
	 */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13832 
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/*
	 * Field-by-field marshal of a kernel struct stat64 into the 32-bit
	 * user-process layout.  Timestamps (including birthtime) are
	 * narrowed with explicit casts for 32-bit callers.
	 */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13873 
13874 /*
13875  * Purge buffer cache for simulating cold starts
13876  */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	/* Push dirty pages to disk and invalidate the vnode's entire UBC range. */
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	/* Always continue iterating; per-vnode errors are not propagated. */
	return VNODE_RETURNED;
}
13884 
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	/* Purge the buffer cache of every vnode on this mount, waiting as needed. */
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	/* Always continue to the next mount. */
	return VFS_RETURNED;
}
13892 
13893 static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
13894 SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
13895 
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	/* Privileged operation: cache purge is for cold-start simulation only. */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
13912 
13913 /*
13914  * gets the vnode associated with the (unnamed) snapshot directory
13915  * for a Filesystem. The snapshot directory vnode is returned with
13916  * an iocount on it.
13917  */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate to the filesystem; *sdvpp is returned with an iocount held. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
13923 
13924 /*
13925  * Get the snapshot vnode.
13926  *
13927  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13928  * needs nameidone() on ndp.
13929  *
13930  * If the snapshot vnode exists it is returned in ndp->ni_vp.
13931  *
13932  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13933  * not needed.
13934  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Ensure outputs are NULL so the error path can clean up uniformly. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only permitted on a volume's root vnode. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; if the loop stops early, the name contained one. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any failure, drop both iocounts so callers need no cleanup. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14037 
14038 /*
14039  * create a filesystem snapshot (for supporting filesystems)
14040  *
14041  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14042  * We get to the (unnamed) snapshot directory vnode and create the vnode
14043  * for the snapshot in it.
14044  *
14045  * Restrictions:
14046  *
14047  *    a) Passed in name for snapshot cannot have slashes.
14048  *    b) name can't be "." or ".."
14049  *
14050  * Since this requires superuser privileges, vnode_authorize calls are not
14051  * made.
14052  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is large; heap-allocate to keep kernel stack use down. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Snapshot by that name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Caller is superuser, so skip authorization and inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14099 
14100 /*
14101  * Delete a Filesystem snapshot
14102  *
14103  * get the vnode for the unnamed snapshot directory and the snapshot and
14104  * delete the snapshot.
14105  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is large; heap-allocate to keep kernel stack use down. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* Resolves the snapshot vnode into ndp->ni_vp (DELETE semantics). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14134 
14135 /*
14136  * Revert a filesystem to a snapshot
14137  *
14138  * Marks the filesystem to revert to the given snapshot on next mount.
14139  */
14140 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14141 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14142     vfs_context_t ctx)
14143 {
14144 	int error;
14145 	vnode_t rvp;
14146 	mount_t mp;
14147 	struct fs_snapshot_revert_args revert_data;
14148 	struct componentname cnp;
14149 	caddr_t name_buf;
14150 	size_t name_len;
14151 
14152 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14153 	if (error) {
14154 		return error;
14155 	}
14156 	mp = vnode_mount(rvp);
14157 
14158 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14159 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14160 	if (error) {
14161 		zfree(ZV_NAMEI, name_buf);
14162 		vnode_put(rvp);
14163 		return error;
14164 	}
14165 
14166 #if CONFIG_MACF
14167 	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14168 	if (error) {
14169 		zfree(ZV_NAMEI, name_buf);
14170 		vnode_put(rvp);
14171 		return error;
14172 	}
14173 #endif
14174 
14175 	/*
14176 	 * Grab mount_iterref so that we can release the vnode,
14177 	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14178 	 */
14179 	error = mount_iterref(mp, 0);
14180 	vnode_put(rvp);
14181 	if (error) {
14182 		zfree(ZV_NAMEI, name_buf);
14183 		return error;
14184 	}
14185 
14186 	memset(&cnp, 0, sizeof(cnp));
14187 	cnp.cn_pnbuf = (char *)name_buf;
14188 	cnp.cn_nameiop = LOOKUP;
14189 	cnp.cn_flags = ISLASTCN | HASBUF;
14190 	cnp.cn_pnlen = MAXPATHLEN;
14191 	cnp.cn_nameptr = cnp.cn_pnbuf;
14192 	cnp.cn_namelen = (int)name_len;
14193 	revert_data.sr_cnp = &cnp;
14194 
14195 	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14196 	mount_iterdrop(mp);
14197 	zfree(ZV_NAMEI, name_buf);
14198 
14199 	if (error) {
14200 		/* If there was any error, try again using VNOP_IOCTL */
14201 
14202 		vnode_t snapdvp;
14203 		struct nameidata namend;
14204 
14205 		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14206 		    OP_LOOKUP, ctx);
14207 		if (error) {
14208 			return error;
14209 		}
14210 
14211 
14212 		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14213 		    0, ctx);
14214 
14215 		vnode_put(namend.ni_vp);
14216 		nameidone(&namend);
14217 		vnode_put(snapdvp);
14218 		vnode_put(rvp);
14219 	}
14220 
14221 	return error;
14222 }
14223 
14224 /*
14225  * rename a Filesystem snapshot
14226  *
14227  * get the vnode for the unnamed snapshot directory and the snapshot and
14228  * rename the snapshot. This is a very specialised (and simple) case of
14229  * rename(2) (which has to deal with a lot more complications). It differs
14230  * slightly from rename(2) in that EEXIST is returned if the new name exists.
14231  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot.  On success we hold iocounts on rvp
	 * (the mount's root vnode), snapdvp (the snapshot directory) and
	 * fromnd->ni_vp (the snapshot being renamed).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks - new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; running off the end (i == name_len) means none found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Set up the destination lookup relative to the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

	/* Cleanup cascade: each label releases what was acquired after it. */
out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14326 
14327 /*
14328  * Mount a Filesystem snapshot
14329  *
14330  * get the vnode for the unnamed snapshot directory and the snapshot and
14331  * mount the snapshot.
14332  */
14333 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14334 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14335     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14336 {
14337 	mount_t mp;
14338 	vnode_t rvp, snapdvp, snapvp, vp, pvp;
14339 	struct fs_snapshot_mount_args smnt_data;
14340 	int error;
14341 	struct nameidata *snapndp, *dirndp;
14342 	/* carving out a chunk for structs that are too big to be on stack. */
14343 	struct {
14344 		struct nameidata snapnd;
14345 		struct nameidata dirnd;
14346 	} * __snapshot_mount_data;
14347 
14348 	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14349 	snapndp = &__snapshot_mount_data->snapnd;
14350 	dirndp = &__snapshot_mount_data->dirnd;
14351 
14352 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14353 	    OP_LOOKUP, ctx);
14354 	if (error) {
14355 		goto out;
14356 	}
14357 
14358 	snapvp  = snapndp->ni_vp;
14359 	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14360 		error = EIO;
14361 		goto out1;
14362 	}
14363 
14364 	/* Get the vnode to be covered */
14365 	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14366 	    UIO_USERSPACE, directory, ctx);
14367 	error = namei(dirndp);
14368 	if (error) {
14369 		goto out1;
14370 	}
14371 
14372 	vp = dirndp->ni_vp;
14373 	pvp = dirndp->ni_dvp;
14374 	mp = vnode_mount(rvp);
14375 
14376 	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14377 		error = EINVAL;
14378 		goto out2;
14379 	}
14380 
14381 #if CONFIG_MACF
14382 	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14383 	    mp->mnt_vfsstat.f_fstypename);
14384 	if (error) {
14385 		goto out2;
14386 	}
14387 #endif
14388 
14389 	smnt_data.sm_mp  = mp;
14390 	smnt_data.sm_cnp = &snapndp->ni_cnd;
14391 	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14392 	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
14393 	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14394 
14395 out2:
14396 	vnode_put(vp);
14397 	vnode_put(pvp);
14398 	nameidone(dirndp);
14399 out1:
14400 	vnode_put(snapvp);
14401 	vnode_put(snapdvp);
14402 	vnode_put(rvp);
14403 	nameidone(snapndp);
14404 out:
14405 	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14406 	return error;
14407 }
14408 
14409 /*
14410  * Root from a snapshot of the filesystem
14411  *
14412  * Marks the filesystem to root from the given snapshot on next boot.
14413  */
14414 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14415 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14416     vfs_context_t ctx)
14417 {
14418 	int error;
14419 	vnode_t rvp;
14420 	mount_t mp;
14421 	struct fs_snapshot_root_args root_data;
14422 	struct componentname cnp;
14423 	caddr_t name_buf;
14424 	size_t name_len;
14425 
14426 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14427 	if (error) {
14428 		return error;
14429 	}
14430 	mp = vnode_mount(rvp);
14431 
14432 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14433 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14434 	if (error) {
14435 		zfree(ZV_NAMEI, name_buf);
14436 		vnode_put(rvp);
14437 		return error;
14438 	}
14439 
14440 	// XXX MAC checks ?
14441 
14442 	/*
14443 	 * Grab mount_iterref so that we can release the vnode,
14444 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14445 	 */
14446 	error = mount_iterref(mp, 0);
14447 	vnode_put(rvp);
14448 	if (error) {
14449 		zfree(ZV_NAMEI, name_buf);
14450 		return error;
14451 	}
14452 
14453 	memset(&cnp, 0, sizeof(cnp));
14454 	cnp.cn_pnbuf = (char *)name_buf;
14455 	cnp.cn_nameiop = LOOKUP;
14456 	cnp.cn_flags = ISLASTCN | HASBUF;
14457 	cnp.cn_pnlen = MAXPATHLEN;
14458 	cnp.cn_nameptr = cnp.cn_pnbuf;
14459 	cnp.cn_namelen = (int)name_len;
14460 	root_data.sr_cnp = &cnp;
14461 
14462 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14463 
14464 	mount_iterdrop(mp);
14465 	zfree(ZV_NAMEI, name_buf);
14466 
14467 	return error;
14468 }
14469 
14470 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14471 vfs_context_can_snapshot(vfs_context_t ctx)
14472 {
14473 	static const char * const snapshot_entitlements[] = {
14474 		"com.apple.private.vfs.snapshot",
14475 		"com.apple.developer.vfs.snapshot",
14476 		"com.apple.private.apfs.arv.limited.snapshot",
14477 	};
14478 	static const size_t nentitlements =
14479 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14480 	size_t i;
14481 
14482 	task_t task = vfs_context_task(ctx);
14483 	for (i = 0; i < nentitlements; i++) {
14484 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14485 			return TRUE;
14486 		}
14487 	}
14488 	return FALSE;
14489 }
14490 
14491 /*
14492  * FS snapshot operations dispatcher
14493  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All operations require one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permitted if the caller is superuser, OR is authorized to
		 * write the backing device vnode, OR holds the
		 * snapshot.user entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		/* name1 = old name, name2 = new name */
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name1 = snapshot name, name2 = directory to cover */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14582