xref: /xnu-8020.140.41/bsd/vfs/vfs_syscalls.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117 
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122 
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125 
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130 
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137 
138 #include <nfs/nfs_conf.h>
139 
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143 
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148 
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 	((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 	release_pathbuff(x)
154 #else
155 #define GET_PATH(x)     \
156 	((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 	zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160 
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164 
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168 
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
171 #endif
172 
173 extern void disk_conditioner_unmount(mount_t mp);
174 
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 	vnode_t olddp;
178 	vnode_t newdp;
179 };
180 /* callback  for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182 
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192     boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195     struct componentname *cnp, user_addr_t fsmountargs,
196     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198 
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200 
201 struct fd_vn_data * fg_vn_data_alloc(void);
202 
203 /*
204  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205  * Concurrent lookups (or lookups by ids) on hard links can cause the
206  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207  * does) to return ENOENT as the path cannot be returned from the name cache
208  * alone. We have no option but to retry and hope to get one namei->reverse path
209  * generation done without an intervening lookup, lookup by id on the hard link
210  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211  * which currently are the MAC hooks for rename, unlink and rmdir.
212  */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214 
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217 
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219     int unlink_flags);
220 
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229 
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236 
237 __private_extern__
238 int sync_internal(void);
239 
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242 
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245 
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249 
250 extern lck_rw_t rootvnode_rw_lock;
251 
252 /*
253  * incremented each time a mount or unmount operation occurs
254  * used to invalidate the cached value of the rootvp in the
255  * mount structure utilized by cache_lookup_path
256  */
257 uint32_t mount_generation = 0;
258 
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261 
262 /* system-wide, per-boot unique mount ID */
263 static _Atomic uint64_t mount_unique_id = 1;
264 
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269 
270 /*
271  * Virtual File System System Calls
272  */
273 
274 /*
275  * Private in-kernel mounting spi (specific use-cases only)
276  */
277 boolean_t
vfs_iskernelmount(mount_t mp)278 vfs_iskernelmount(mount_t mp)
279 {
280 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
281 }
282 
283 __private_extern__
284 int
kernel_mount(const char * fstype,vnode_t pvp,vnode_t vp,const char * path,void * data,__unused size_t datalen,int syscall_flags,uint32_t kern_flags,vfs_context_t ctx)285 kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
286     void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
287     vfs_context_t ctx)
288 {
289 	struct nameidata nd;
290 	boolean_t did_namei;
291 	int error;
292 
293 	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
294 	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
295 
296 	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;
297 
298 	/*
299 	 * Get the vnode to be covered if it's not supplied
300 	 */
301 	if (vp == NULLVP) {
302 		error = namei(&nd);
303 		if (error) {
304 			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
305 				printf("failed to locate mount-on path: %s ", path);
306 			}
307 			return error;
308 		}
309 		vp = nd.ni_vp;
310 		pvp = nd.ni_dvp;
311 		did_namei = TRUE;
312 	} else {
313 		char *pnbuf = CAST_DOWN(char *, path);
314 
315 		nd.ni_cnd.cn_pnbuf = pnbuf;
316 		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
317 		did_namei = FALSE;
318 	}
319 
320 	kern_flags |= KERNEL_MOUNT_KMOUNT;
321 	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
322 	    syscall_flags, kern_flags, NULL, ctx);
323 
324 	if (did_namei) {
325 		vnode_put(vp);
326 		vnode_put(pvp);
327 		nameidone(&nd);
328 	}
329 
330 	return error;
331 }
332 
333 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)334 vfs_mount_at_path(const char *fstype, const char *path,
335     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
336     int mnt_flags, int flags)
337 {
338 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
339 	int error, km_flags = 0;
340 
341 	/*
342 	 * This call is currently restricted to specific use cases.
343 	 */
344 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
345 		return ENOTSUP;
346 	}
347 
348 #if !defined(XNU_TARGET_OS_OSX)
349 	if (strcmp(fstype, "lifs") == 0) {
350 		syscall_flags |= MNT_NOEXEC;
351 	}
352 #endif
353 
354 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
355 		km_flags |= KERNEL_MOUNT_NOAUTH;
356 	}
357 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
358 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
359 	}
360 
361 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
362 	    syscall_flags, km_flags, vfs_context_kernel());
363 	if (error) {
364 		printf("%s: mount on %s failed, error %d\n", __func__, path,
365 		    error);
366 	}
367 
368 	return error;
369 }
370 
371 int
vfs_mount_override_type_name(mount_t mp,const char * name)372 vfs_mount_override_type_name(mount_t mp, const char *name)
373 {
374 	if (mp == NULL || name == NULL) {
375 		return EINVAL;
376 	}
377 
378 	/* Override the FS type name. */
379 	mount_lock_spin(mp);
380 	strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
381 	mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
382 	mount_unlock(mp);
383 
384 	return 0;
385 }
386 
387 /*
388  * Mount a file system.
389  */
390 /* ARGSUSED */
391 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)392 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
393 {
394 	struct __mac_mount_args muap;
395 
396 	muap.type = uap->type;
397 	muap.path = uap->path;
398 	muap.flags = uap->flags;
399 	muap.data = uap->data;
400 	muap.mac_p = USER_ADDR_NULL;
401 	return __mac_mount(p, &muap, retval);
402 }
403 
404 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)405 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
406 {
407 	struct componentname    cn;
408 	vfs_context_t           ctx = vfs_context_current();
409 	size_t                  dummy = 0;
410 	int                     error;
411 	int                     flags = uap->flags;
412 	char                    fstypename[MFSNAMELEN];
413 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
414 	vnode_t                 pvp;
415 	vnode_t                 vp;
416 
417 	AUDIT_ARG(fd, uap->fd);
418 	AUDIT_ARG(fflags, flags);
419 	/* fstypename will get audited by mount_common */
420 
421 	/* Sanity check the flags */
422 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
423 		return ENOTSUP;
424 	}
425 
426 	if (flags & MNT_UNION) {
427 		return EPERM;
428 	}
429 
430 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
431 	if (error) {
432 		return error;
433 	}
434 
435 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
436 		return error;
437 	}
438 
439 	if ((error = vnode_getwithref(vp)) != 0) {
440 		file_drop(uap->fd);
441 		return error;
442 	}
443 
444 	pvp = vnode_getparent(vp);
445 	if (pvp == NULL) {
446 		vnode_put(vp);
447 		file_drop(uap->fd);
448 		return EINVAL;
449 	}
450 
451 	memset(&cn, 0, sizeof(struct componentname));
452 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
453 	cn.cn_pnlen = MAXPATHLEN;
454 
455 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
456 		zfree(ZV_NAMEI, cn.cn_pnbuf);
457 		vnode_put(pvp);
458 		vnode_put(vp);
459 		file_drop(uap->fd);
460 		return error;
461 	}
462 
463 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
464 
465 	zfree(ZV_NAMEI, cn.cn_pnbuf);
466 	vnode_put(pvp);
467 	vnode_put(vp);
468 	file_drop(uap->fd);
469 
470 	return error;
471 }
472 
/*
 * Announce a new mount: raise a VQ_MOUNT vfs event (NULL mount => broadcast
 * to all watchers) and post NOTE_WRITE on the parent of the covered vnode
 * so kqueue watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
479 
480 /*
481  * __mac_mount:
482  *	Mount a file system taking into account MAC label behavior.
483  *	See mount(2) man page for more information
484  *
485  * Parameters:    p                        Process requesting the mount
486  *                uap                      User argument descriptor (see below)
487  *                retval                   (ignored)
488  *
489  * Indirect:      uap->type                Filesystem type
490  *                uap->path                Path to mount
491  *                uap->data                Mount arguments
492  *                uap->mac_p               MAC info
493  *                uap->flags               Mount flags
494  *
495  *
496  * Returns:        0                       Success
497  *                !0                       Not success
498  */
499 boolean_t root_fs_upgrade_try = FALSE;
500 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;             /* parent of the covered vnode (from namei WANTPARENT) */
	vnode_t vp = NULL;              /* vnode to be covered by the new mount */
	int need_nameidone = 0;         /* nonzero once namei() state must be torn down */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* MAC label copied in from user space, if any */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	/* Note: '==' (not '&') is deliberate — MNT_IMGSRC_BY_INDEX must be the only flag. */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The user_mac layout differs for 32/64-bit callers; normalize it. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label: at least one char + NUL, at most the MAC max. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the target is the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			/* Mounting over '/' is treated as an update of the root FS. */
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() tolerates a NULL labelstr (label never copied in). */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Release the iocounts taken by namei() and its path-buffer state. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
661 
662 /*
663  * common mount implementation (final stage of mounting)
664  *
665  * Arguments:
666  *  fstypename	file system type (ie it's vfs name)
667  *  pvp		parent of covered vnode
668  *  vp		covered vnode
669  *  cnp		component name (ie path) of covered vnode
670  *  flags	generic mount flags
671  *  fsmountargs	file system specific data
672  *  labelstr	optional MAC label
673  *  kernelmount	TRUE for mounts initiated from inside the kernel
674  *  ctx		caller's context
675  */
676 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)677 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
678     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
679     char *labelstr, vfs_context_t ctx)
680 {
681 #if !CONFIG_MACF
682 #pragma unused(labelstr)
683 #endif
684 	struct vnode *devvp = NULLVP;
685 	struct vnode *device_vnode = NULLVP;
686 #if CONFIG_MACF
687 	struct vnode *rvp;
688 #endif
689 	struct mount *mp;
690 	struct vfstable *vfsp = (struct vfstable *)0;
691 	struct proc *p = vfs_context_proc(ctx);
692 	int error, flag = 0;
693 	bool flag_set = false;
694 	user_addr_t devpath = USER_ADDR_NULL;
695 	int ronly = 0;
696 	int mntalloc = 0;
697 	boolean_t vfsp_ref = FALSE;
698 	boolean_t is_rwlock_locked = FALSE;
699 	boolean_t did_rele = FALSE;
700 	boolean_t have_usecount = FALSE;
701 	boolean_t did_set_lmount = FALSE;
702 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
703 
704 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
705 	/* Check for mutually-exclusive flag bits */
706 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
707 	int bitcount = 0;
708 	while (checkflags != 0) {
709 		checkflags &= (checkflags - 1);
710 		bitcount++;
711 	}
712 
713 	if (bitcount > 1) {
714 		//not allowed to request multiple mount-by-role flags
715 		error = EINVAL;
716 		goto out1;
717 	}
718 #endif
719 
720 	/*
721 	 * Process an update for an existing mount
722 	 */
723 	if (flags & MNT_UPDATE) {
724 		if ((vp->v_flag & VROOT) == 0) {
725 			error = EINVAL;
726 			goto out1;
727 		}
728 		mp = vp->v_mount;
729 
730 		/* if unmount or mount in progress, return error */
731 		mount_lock_spin(mp);
732 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
733 			mount_unlock(mp);
734 			error = EBUSY;
735 			goto out1;
736 		}
737 		mp->mnt_lflag |= MNT_LMOUNT;
738 		did_set_lmount = TRUE;
739 		mount_unlock(mp);
740 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
741 		is_rwlock_locked = TRUE;
742 		/*
743 		 * We only allow the filesystem to be reloaded if it
744 		 * is currently mounted read-only.
745 		 */
746 		if ((flags & MNT_RELOAD) &&
747 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
748 			error = ENOTSUP;
749 			goto out1;
750 		}
751 
752 		/*
753 		 * If content protection is enabled, update mounts are not
754 		 * allowed to turn it off.
755 		 */
756 		if ((mp->mnt_flag & MNT_CPROTECT) &&
757 		    ((flags & MNT_CPROTECT) == 0)) {
758 			error = EINVAL;
759 			goto out1;
760 		}
761 
762 		/*
763 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
764 		 * failure to return an error for this so we'll just silently
765 		 * add it if it is not passed in.
766 		 */
767 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
768 		    ((flags & MNT_REMOVABLE) == 0)) {
769 			flags |= MNT_REMOVABLE;
770 		}
771 
772 		/* Can't downgrade the backer of the root FS */
773 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
774 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
775 			error = ENOTSUP;
776 			goto out1;
777 		}
778 
779 		/*
780 		 * Only root, or the user that did the original mount is
781 		 * permitted to update it.
782 		 */
783 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
784 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
785 			goto out1;
786 		}
787 #if CONFIG_MACF
788 		error = mac_mount_check_remount(ctx, mp);
789 		if (error != 0) {
790 			goto out1;
791 		}
792 #endif
793 		/*
794 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
795 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
796 		 */
797 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
798 			flags |= MNT_NOSUID | MNT_NODEV;
799 			if (mp->mnt_flag & MNT_NOEXEC) {
800 				flags |= MNT_NOEXEC;
801 			}
802 		}
803 		flag = mp->mnt_flag;
804 		flag_set = true;
805 
806 
807 
808 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
809 
810 		vfsp = mp->mnt_vtable;
811 		goto update;
812 	} // MNT_UPDATE
813 
814 	/*
815 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
816 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
817 	 */
818 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
819 		flags |= MNT_NOSUID | MNT_NODEV;
820 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
821 			flags |= MNT_NOEXEC;
822 		}
823 	}
824 
825 	/* XXXAUDIT: Should we capture the type on the error path as well? */
826 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
827 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
828 	mount_list_lock();
829 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
830 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
831 			vfsp->vfc_refcount++;
832 			vfsp_ref = TRUE;
833 			break;
834 		}
835 	}
836 	mount_list_unlock();
837 	if (vfsp == NULL) {
838 		error = ENODEV;
839 		goto out1;
840 	}
841 
842 	/*
843 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
844 	 * except in ROSV configs and for the initial BaseSystem root.
845 	 */
846 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
847 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
848 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
849 		error = EINVAL;  /* unsupported request */
850 		goto out1;
851 	}
852 
853 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
854 	if (error != 0) {
855 		goto out1;
856 	}
857 
858 	/*
859 	 * Allocate and initialize the filesystem (mount_t)
860 	 */
861 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
862 	mntalloc = 1;
863 
864 	/* Initialize the default IO constraints */
865 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
866 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
867 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
868 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
869 	mp->mnt_devblocksize = DEV_BSIZE;
870 	mp->mnt_alignmentmask = PAGE_MASK;
871 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
872 	mp->mnt_ioscale = 1;
873 	mp->mnt_ioflags = 0;
874 	mp->mnt_realrootvp = NULLVP;
875 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
876 
877 	mp->mnt_lflag |= MNT_LMOUNT;
878 	did_set_lmount = TRUE;
879 
880 	TAILQ_INIT(&mp->mnt_vnodelist);
881 	TAILQ_INIT(&mp->mnt_workerqueue);
882 	TAILQ_INIT(&mp->mnt_newvnodes);
883 	mount_lock_init(mp);
884 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
885 	is_rwlock_locked = TRUE;
886 	mp->mnt_op = vfsp->vfc_vfsops;
887 	mp->mnt_vtable = vfsp;
888 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
889 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
890 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
891 	do {
892 		int pathlen = MAXPATHLEN;
893 
894 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
895 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
896 		}
897 	} while (0);
898 	mp->mnt_vnodecovered = vp;
899 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
900 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
901 	mp->mnt_devbsdunit = 0;
902 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
903 
904 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
905 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
906 
907 	if (kernelmount) {
908 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
909 	}
910 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
911 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
912 	}
913 
914 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
915 		// kernel mounted devfs
916 		mp->mnt_kern_flag |= MNTK_SYSTEM;
917 	}
918 
919 update:
920 
921 	/*
922 	 * Set the mount level flags.
923 	 */
924 	if (flags & MNT_RDONLY) {
925 		mp->mnt_flag |= MNT_RDONLY;
926 	} else if (mp->mnt_flag & MNT_RDONLY) {
927 		// disallow read/write upgrades of file systems that
928 		// had the TYPENAME_OVERRIDE feature set.
929 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
930 			error = EPERM;
931 			goto out1;
932 		}
933 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
934 	}
935 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
936 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
937 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
938 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
939 	    MNT_QUARANTINE | MNT_CPROTECT);
940 
941 #if SECURE_KERNEL
942 #if !CONFIG_MNT_SUID
943 	/*
944 	 * On release builds of iOS based platforms, always enforce NOSUID on
945 	 * all mounts. We do this here because we can catch update mounts as well as
946 	 * non-update mounts in this case.
947 	 */
948 	mp->mnt_flag |= (MNT_NOSUID);
949 #endif
950 #endif
951 
952 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
953 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
954 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
955 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
956 	    MNT_QUARANTINE | MNT_CPROTECT);
957 
958 #if CONFIG_MACF
959 	if (flags & MNT_MULTILABEL) {
960 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
961 			error = EINVAL;
962 			goto out1;
963 		}
964 		mp->mnt_flag |= MNT_MULTILABEL;
965 	}
966 #endif
967 	/*
968 	 * Process device path for local file systems if requested.
969 	 *
970 	 * Snapshot and mount-by-role mounts do not use this path; they are
971 	 * passing other opaque data in the device path field.
972 	 *
973 	 * Basesystemroot mounts pass a device path to be resolved here,
974 	 * but it's just a char * already inside the kernel, which
975 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
976 	 * mounts we must skip copyin (both of the address and of the string
977 	 * (in NDINIT).
978 	 */
979 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
980 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
981 		boolean_t do_copyin_devpath = true;
982 #if CONFIG_BASESYSTEMROOT
983 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
985 			// We have been passed fsmountargs, which is typed as a user_addr_t,
986 			// but is actually a char ** pointing to a (kernelspace) string.
987 			// We manually unpack it with a series of casts and dereferences
988 			// that reverses what was done just above us on the stack in
989 			// imageboot_pivot_image().
990 			// After retrieving the path to the dev node (which we will NDINIT
991 			// in a moment), we pass NULL fsmountargs on to the filesystem.
992 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
993 			char **devnamepp = (char **)fsmountargs;
994 			char *devnamep = *devnamepp;
995 			devpath = CAST_USER_ADDR_T(devnamep);
996 			do_copyin_devpath = false;
997 			fsmountargs = USER_ADDR_NULL;
998 
999 			//Now that we have a mp, denote that this mount is for the basesystem.
1000 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1001 		}
1002 #endif // CONFIG_BASESYSTEMROOT
1003 
1004 		if (do_copyin_devpath) {
1005 			if (vfs_context_is64bit(ctx)) {
1006 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1007 					goto out1;
1008 				}
1009 				fsmountargs += sizeof(devpath);
1010 			} else {
1011 				user32_addr_t tmp;
1012 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1013 					goto out1;
1014 				}
1015 				/* munge into LP64 addr */
1016 				devpath = CAST_USER_ADDR_T(tmp);
1017 				fsmountargs += sizeof(tmp);
1018 			}
1019 		}
1020 
1021 		/* Lookup device and authorize access to it */
1022 		if ((devpath)) {
1023 			struct nameidata nd;
1024 
1025 			enum uio_seg seg = UIO_USERSPACE;
1026 #if CONFIG_BASESYSTEMROOT
1027 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1028 				seg = UIO_SYSSPACE;
1029 			}
1030 #endif // CONFIG_BASESYSTEMROOT
1031 
1032 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1033 			if ((error = namei(&nd))) {
1034 				goto out1;
1035 			}
1036 
1037 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1038 			devvp = nd.ni_vp;
1039 
1040 			nameidone(&nd);
1041 
1042 			if (devvp->v_type != VBLK) {
1043 				error = ENOTBLK;
1044 				goto out2;
1045 			}
1046 			if (major(devvp->v_rdev) >= nblkdev) {
1047 				error = ENXIO;
1048 				goto out2;
1049 			}
1050 			/*
1051 			 * If mount by non-root, then verify that user has necessary
1052 			 * permissions on the device.
1053 			 */
1054 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1055 				mode_t accessmode = KAUTH_VNODE_READ_DATA;
1056 
1057 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1058 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1059 				}
1060 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1061 					goto out2;
1062 				}
1063 			}
1064 		}
1065 		/* On first mount, preflight and open device */
1066 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1067 			if ((error = vnode_ref(devvp))) {
1068 				goto out2;
1069 			}
1070 			/*
1071 			 * Disallow multiple mounts of the same device.
1072 			 * Disallow mounting of a device that is currently in use
1073 			 * (except for root, which might share swap device for miniroot).
1074 			 * Flush out any old buffers remaining from a previous use.
1075 			 */
1076 			if ((error = vfs_mountedon(devvp))) {
1077 				goto out3;
1078 			}
1079 
1080 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1081 				error = EBUSY;
1082 				goto out3;
1083 			}
1084 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1085 				error = ENOTBLK;
1086 				goto out3;
1087 			}
1088 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1089 				goto out3;
1090 			}
1091 
1092 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1093 #if CONFIG_MACF
1094 			error = mac_vnode_check_open(ctx,
1095 			    devvp,
1096 			    ronly ? FREAD : FREAD | FWRITE);
1097 			if (error) {
1098 				goto out3;
1099 			}
1100 #endif /* MAC */
1101 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1102 				goto out3;
1103 			}
1104 
1105 			mp->mnt_devvp = devvp;
1106 			device_vnode = devvp;
1107 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1108 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1109 		    (device_vnode = mp->mnt_devvp)) {
1110 			dev_t dev;
1111 			int maj;
1112 			/*
1113 			 * If upgrade to read-write by non-root, then verify
1114 			 * that user has necessary permissions on the device.
1115 			 */
1116 			vnode_getalways(device_vnode);
1117 
1118 			if (suser(vfs_context_ucred(ctx), NULL) &&
1119 			    (error = vnode_authorize(device_vnode, NULL,
1120 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1121 			    ctx)) != 0) {
1122 				vnode_put(device_vnode);
1123 				goto out2;
1124 			}
1125 
1126 			/* Tell the device that we're upgrading */
1127 			dev = (dev_t)device_vnode->v_rdev;
1128 			maj = major(dev);
1129 
1130 			if ((u_int)maj >= (u_int)nblkdev) {
1131 				panic("Volume mounted on a device with invalid major number.");
1132 			}
1133 
1134 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1135 			vnode_put(device_vnode);
1136 			device_vnode = NULLVP;
1137 			if (error != 0) {
1138 				goto out2;
1139 			}
1140 		}
1141 	} // localargs && !(snapshot | data | vm)
1142 
1143 #if CONFIG_MACF
1144 	if ((flags & MNT_UPDATE) == 0) {
1145 		mac_mount_label_init(mp);
1146 		mac_mount_label_associate(ctx, mp);
1147 	}
1148 	if (labelstr) {
1149 		if ((flags & MNT_UPDATE) != 0) {
1150 			error = mac_mount_check_label_update(ctx, mp);
1151 			if (error != 0) {
1152 				goto out3;
1153 			}
1154 		}
1155 	}
1156 #endif
1157 	/*
1158 	 * Mount the filesystem.  We already asserted that internal_flags
1159 	 * cannot have more than one mount-by-role bit set.
1160 	 */
1161 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1162 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1163 		    (caddr_t)fsmountargs, 0, ctx);
1164 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1165 #if CONFIG_ROSV_STARTUP
1166 		struct mount *origin_mp = (struct mount*)fsmountargs;
1167 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1168 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1169 		if (error) {
1170 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1171 		} else {
1172 			/* Mark volume associated with system volume */
1173 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1174 
1175 			/* Attempt to acquire the mnt_devvp and set it up */
1176 			struct vnode *mp_devvp = NULL;
1177 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1178 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1179 				    0, &mp_devvp, vfs_context_kernel());
1180 				if (!lerr) {
1181 					mp->mnt_devvp = mp_devvp;
1182 					//vnode_lookup took an iocount, need to drop it.
1183 					vnode_put(mp_devvp);
1184 					// now set `device_vnode` to the devvp that was acquired.
1185 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1186 					// note that though the iocount above was dropped, the mount acquires
1187 					// an implicit reference against the device.
1188 					device_vnode = mp_devvp;
1189 				}
1190 			}
1191 		}
1192 #else
1193 		error = EINVAL;
1194 #endif
1195 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1196 #if CONFIG_MOUNT_VM
1197 		struct mount *origin_mp = (struct mount*)fsmountargs;
1198 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1199 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1200 		if (error) {
1201 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1202 		} else {
1203 			/* Mark volume associated with system volume and a swap mount */
1204 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1205 			/* Attempt to acquire the mnt_devvp and set it up */
1206 			struct vnode *mp_devvp = NULL;
1207 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1208 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1209 				    0, &mp_devvp, vfs_context_kernel());
1210 				if (!lerr) {
1211 					mp->mnt_devvp = mp_devvp;
1212 					//vnode_lookup took an iocount, need to drop it.
1213 					vnode_put(mp_devvp);
1214 
1215 					// now set `device_vnode` to the devvp that was acquired.
1216 					// note that though the iocount above was dropped, the mount acquires
1217 					// an implicit reference against the device.
1218 					device_vnode = mp_devvp;
1219 				}
1220 			}
1221 		}
1222 #else
1223 		error = EINVAL;
1224 #endif
1225 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1226 #if CONFIG_MOUNT_PREBOOTRECOVERY
1227 		struct mount *origin_mp = (struct mount*)fsmountargs;
1228 		uint32_t mount_role = 0;
1229 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1230 			mount_role = VFS_PREBOOT_ROLE;
1231 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1232 			mount_role = VFS_RECOVERY_ROLE;
1233 		}
1234 
1235 		if (mount_role != 0) {
1236 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1237 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1238 			if (error) {
1239 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1240 			} else {
1241 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1242 				/* Mark volume associated with system volume */
1243 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1244 				/* Attempt to acquire the mnt_devvp and set it up */
1245 				struct vnode *mp_devvp = NULL;
1246 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1247 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1248 					    0, &mp_devvp, vfs_context_kernel());
1249 					if (!lerr) {
1250 						mp->mnt_devvp = mp_devvp;
1251 						//vnode_lookup took an iocount, need to drop it.
1252 						vnode_put(mp_devvp);
1253 
1254 						// now set `device_vnode` to the devvp that was acquired.
1255 						// note that though the iocount above was dropped, the mount acquires
1256 						// an implicit reference against the device.
1257 						device_vnode = mp_devvp;
1258 					}
1259 				}
1260 			}
1261 		} else {
1262 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1263 			error = EINVAL;
1264 		}
1265 #else
1266 		error = EINVAL;
1267 #endif
1268 	} else {
1269 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1270 	}
1271 
1272 	if (flags & MNT_UPDATE) {
1273 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1274 			mp->mnt_flag &= ~MNT_RDONLY;
1275 		}
1276 		mp->mnt_flag &= ~
1277 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1278 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1279 		if (error) {
1280 			mp->mnt_flag = flag;  /* restore flag value */
1281 		}
1282 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1283 		lck_rw_done(&mp->mnt_rwlock);
1284 		is_rwlock_locked = FALSE;
1285 		if (!error) {
1286 			enablequotas(mp, ctx);
1287 		}
1288 		goto exit;
1289 	}
1290 
1291 	/*
1292 	 * Put the new filesystem on the mount list after root.
1293 	 */
1294 	if (error == 0) {
1295 		struct vfs_attr vfsattr;
1296 #if CONFIG_MACF
1297 		error = mac_mount_check_mount_late(ctx, mp);
1298 		if (error != 0) {
1299 			goto out4;
1300 		}
1301 
1302 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1303 			error = VFS_ROOT(mp, &rvp, ctx);
1304 			if (error) {
1305 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1306 				goto out4;
1307 			}
1308 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1309 			/*
1310 			 * drop reference provided by VFS_ROOT
1311 			 */
1312 			vnode_put(rvp);
1313 
1314 			if (error) {
1315 				goto out4;
1316 			}
1317 		}
1318 #endif  /* MAC */
1319 
1320 		vnode_lock_spin(vp);
1321 		CLR(vp->v_flag, VMOUNT);
1322 		vp->v_mountedhere = mp;
1323 		vnode_unlock(vp);
1324 
1325 		/*
1326 		 * taking the name_cache_lock exclusively will
1327 		 * insure that everyone is out of the fast path who
1328 		 * might be trying to use a now stale copy of
1329 		 * vp->v_mountedhere->mnt_realrootvp
1330 		 * bumping mount_generation causes the cached values
1331 		 * to be invalidated
1332 		 */
1333 		name_cache_lock();
1334 		mount_generation++;
1335 		name_cache_unlock();
1336 
1337 		error = vnode_ref(vp);
1338 		if (error != 0) {
1339 			goto out4;
1340 		}
1341 
1342 		have_usecount = TRUE;
1343 
1344 		error = checkdirs(vp, ctx);
1345 		if (error != 0) {
1346 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1347 			goto out4;
1348 		}
1349 		/*
1350 		 * there is no cleanup code here so I have made it void
1351 		 * we need to revisit this
1352 		 */
1353 		(void)VFS_START(mp, 0, ctx);
1354 
1355 		if (mount_list_add(mp) != 0) {
1356 			/*
1357 			 * The system is shutting down trying to umount
1358 			 * everything, so fail with a plausible errno.
1359 			 */
1360 			error = EBUSY;
1361 			goto out4;
1362 		}
1363 		lck_rw_done(&mp->mnt_rwlock);
1364 		is_rwlock_locked = FALSE;
1365 
1366 		/* Check if this mounted file system supports EAs or named streams. */
1367 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1368 		VFSATTR_INIT(&vfsattr);
1369 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1370 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1371 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1372 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1373 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1374 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1375 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1376 			}
1377 #if NAMEDSTREAMS
1378 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1379 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1380 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1381 			}
1382 #endif
1383 			/* Check if this file system supports path from id lookups. */
1384 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1385 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1386 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1387 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1388 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1389 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1390 			}
1391 
1392 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1393 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1394 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1395 			}
1396 		}
1397 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1398 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1399 		}
1400 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1401 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1402 		}
1403 		/* increment the operations count */
1404 		OSAddAtomic(1, &vfs_nummntops);
1405 		enablequotas(mp, ctx);
1406 
1407 		if (device_vnode) {
1408 			device_vnode->v_specflags |= SI_MOUNTEDON;
1409 
1410 			/*
1411 			 *   cache the IO attributes for the underlying physical media...
1412 			 *   an error return indicates the underlying driver doesn't
1413 			 *   support all the queries necessary... however, reasonable
1414 			 *   defaults will have been set, so no reason to bail or care
1415 			 */
1416 			vfs_init_io_attributes(device_vnode, mp);
1417 		}
1418 
1419 		/* Now that mount is setup, notify the listeners */
1420 		vfs_notify_mount(pvp);
1421 		IOBSDMountChange(mp, kIOMountChangeMount);
1422 	} else {
1423 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1424 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1425 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1426 			    mp->mnt_vtable->vfc_name, error);
1427 		}
1428 
1429 		vnode_lock_spin(vp);
1430 		CLR(vp->v_flag, VMOUNT);
1431 		vnode_unlock(vp);
1432 		mount_list_lock();
1433 		mp->mnt_vtable->vfc_refcount--;
1434 		mount_list_unlock();
1435 
1436 		if (device_vnode) {
1437 			vnode_rele(device_vnode);
1438 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1439 		}
1440 		lck_rw_done(&mp->mnt_rwlock);
1441 		is_rwlock_locked = FALSE;
1442 
1443 		/*
1444 		 * if we get here, we have a mount structure that needs to be freed,
1445 		 * but since the coveredvp hasn't yet been updated to point at it,
1446 		 * no need to worry about other threads holding a crossref on this mp
1447 		 * so it's ok to just free it
1448 		 */
1449 		mount_lock_destroy(mp);
1450 #if CONFIG_MACF
1451 		mac_mount_label_destroy(mp);
1452 #endif
1453 		zfree(mount_zone, mp);
1454 		did_set_lmount = false;
1455 	}
1456 exit:
1457 	/*
1458 	 * drop I/O count on the device vp if there was one
1459 	 */
1460 	if (devpath && devvp) {
1461 		vnode_put(devvp);
1462 	}
1463 
1464 	if (did_set_lmount) {
1465 		mount_lock_spin(mp);
1466 		mp->mnt_lflag &= ~MNT_LMOUNT;
1467 		mount_unlock(mp);
1468 	}
1469 
1470 	return error;
1471 
1472 /* Error condition exits */
1473 out4:
1474 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1475 
1476 	/*
1477 	 * If the mount has been placed on the covered vp,
1478 	 * it may have been discovered by now, so we have
1479 	 * to treat this just like an unmount
1480 	 */
1481 	mount_lock_spin(mp);
1482 	mp->mnt_lflag |= MNT_LDEAD;
1483 	mount_unlock(mp);
1484 
1485 	if (device_vnode != NULLVP) {
1486 		vnode_rele(device_vnode);
1487 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1488 		    ctx);
1489 		did_rele = TRUE;
1490 	}
1491 
1492 	vnode_lock_spin(vp);
1493 
1494 	mp->mnt_crossref++;
1495 	vp->v_mountedhere = (mount_t) 0;
1496 
1497 	vnode_unlock(vp);
1498 
1499 	if (have_usecount) {
1500 		vnode_rele(vp);
1501 	}
1502 out3:
1503 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1504 		vnode_rele(devvp);
1505 	}
1506 out2:
1507 	if (devpath && devvp) {
1508 		vnode_put(devvp);
1509 	}
1510 out1:
1511 	/* Release mnt_rwlock only when it was taken */
1512 	if (is_rwlock_locked == TRUE) {
1513 		if (flag_set) {
1514 			mp->mnt_flag = flag;  /* restore mnt_flag value */
1515 		}
1516 		lck_rw_done(&mp->mnt_rwlock);
1517 	}
1518 
1519 	if (did_set_lmount) {
1520 		mount_lock_spin(mp);
1521 		mp->mnt_lflag &= ~MNT_LMOUNT;
1522 		mount_unlock(mp);
1523 	}
1524 
1525 	if (mntalloc) {
1526 		if (mp->mnt_crossref) {
1527 			mount_dropcrossref(mp, vp, 0);
1528 		} else {
1529 			mount_lock_destroy(mp);
1530 #if CONFIG_MACF
1531 			mac_mount_label_destroy(mp);
1532 #endif
1533 			zfree(mount_zone, mp);
1534 		}
1535 	}
1536 	if (vfsp_ref) {
1537 		mount_list_lock();
1538 		vfsp->vfc_refcount--;
1539 		mount_list_unlock();
1540 	}
1541 
1542 	return error;
1543 }
1544 
1545 /*
1546  * Flush in-core data, check for competing mount attempts,
1547  * and set VMOUNT
1548  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH: in-kernel caller vouches for access; skip the ownership check */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push any dirty data for the covered vnode to disk before it is obscured */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* ... and invalidate any buffers still cached against it */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories may be covered by a mount */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);
	/*
	 * Busy check differs by caller: for fmount-style callers either an
	 * in-progress mount (VMOUNT) or an existing cover (v_mountedhere)
	 * makes the vnode busy; otherwise only both together do.
	 * NOTE(review): the &&-form for the non-fmount case looks deliberate
	 * (races are resolved later) — confirm against callers before changing.
	 */
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Claim the vnode: a mount is now in progress on it */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: undo the in-progress claim taken above */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1614 
1615 #if CONFIG_IMGSRC_ACCESS
1616 
1617 #define DEBUG_IMGSRC 0
1618 
1619 #if DEBUG_IMGSRC
1620 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1621 #else
1622 #define IMGSRC_DEBUG(args...) do { } while(0)
1623 #endif
1624 
/*
 * Resolve `devpath`, verify it names the block device backing `mp`,
 * authorize access for non-root callers, and refresh f_mntfromname.
 * On success *devvpp holds the looked-up vnode with the iocount from
 * namei() still held; the caller must vnode_put() it.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	mode_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* A kernel context means devpath is a kernel-space string */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	/* On success, nd.ni_vp carries an iocount that we own until put/handed off */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must refer to the very device the mount sits on */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	/* Record the (possibly new) path to the device in the mount's stats */
	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: hand the looked-up vnode (iocount still held) to the caller */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	if (error) {
		/* drop the iocount namei() granted on vp; caller gets nothing */
		vnode_put(vp);
	}

	return error;
}
1702 
1703 /*
1704  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1705  * and call checkdirs()
1706  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the mount-in-progress marker for the real covered-vnode linkage */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* The covered vnode holds a usecount for as long as the mount sits on it */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		/* Failure: disassociate the mount from the covered vnode again */
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1752 
/* Reverse place_mount_and_checkdirs(): drop the covered vnode's usecount
 * and sever the vnode<->mount linkage in both directions. */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1763 
/*
 * Gate an update-style operation on `mp`: fail if a mount/unmount is in
 * flight, take the mount rwlock exclusively, and authorize the caller.
 * On success the rwlock is held (released by mount_end_update()); on
 * error it has already been dropped.
 */
static int
mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
{
	int error;

	/* unmount in progress return error */
	mount_lock_spin(mp);
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		mount_unlock(mp);
		return EBUSY;
	}
	mount_unlock(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/*
	 * We only allow the filesystem to be reloaded if it
	 * is currently mounted read-only.
	 */
	if ((flags & MNT_RELOAD) &&
	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to update it.
	 */
	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
	    (!vfs_context_issuser(ctx))) {
		error = EPERM;
		goto out;
	}
#if CONFIG_MACF
	error = mac_mount_check_remount(ctx, mp);
	if (error != 0) {
		goto out;
	}
#endif

out:
	if (error) {
		/* Caller never sees the lock held on failure */
		lck_rw_done(&mp->mnt_rwlock);
	}

	return error;
}
1811 
/* Release the exclusive mount rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1817 
1818 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)1819 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1820 {
1821 	vnode_t vp;
1822 
1823 	if (height >= MAX_IMAGEBOOT_NESTING) {
1824 		return EINVAL;
1825 	}
1826 
1827 	vp = imgsrc_rootvnodes[height];
1828 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1829 		*rvpp = vp;
1830 		return 0;
1831 	} else {
1832 		return ENOENT;
1833 	}
1834 }
1835 
1836 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)1837 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1838     struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1839     boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1840 {
1841 	int error;
1842 	mount_t mp;
1843 	boolean_t placed = FALSE;
1844 	struct vfstable *vfsp;
1845 	user_addr_t devpath;
1846 	char *old_mntonname;
1847 	vnode_t rvp;
1848 	vnode_t devvp;
1849 	uint32_t height;
1850 	uint32_t flags;
1851 
1852 	/* If we didn't imageboot, nothing to move */
1853 	if (imgsrc_rootvnodes[0] == NULLVP) {
1854 		return EINVAL;
1855 	}
1856 
1857 	/* Only root can do this */
1858 	if (!vfs_context_issuser(ctx)) {
1859 		return EPERM;
1860 	}
1861 
1862 	IMGSRC_DEBUG("looking for root vnode.\n");
1863 
1864 	/*
1865 	 * Get root vnode of filesystem we're moving.
1866 	 */
1867 	if (by_index) {
1868 		if (is64bit) {
1869 			struct user64_mnt_imgsrc_args mia64;
1870 			error = copyin(fsmountargs, &mia64, sizeof(mia64));
1871 			if (error != 0) {
1872 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
1873 				return error;
1874 			}
1875 
1876 			height = mia64.mi_height;
1877 			flags = mia64.mi_flags;
1878 			devpath = (user_addr_t)mia64.mi_devpath;
1879 		} else {
1880 			struct user32_mnt_imgsrc_args mia32;
1881 			error = copyin(fsmountargs, &mia32, sizeof(mia32));
1882 			if (error != 0) {
1883 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
1884 				return error;
1885 			}
1886 
1887 			height = mia32.mi_height;
1888 			flags = mia32.mi_flags;
1889 			devpath = mia32.mi_devpath;
1890 		}
1891 	} else {
1892 		/*
1893 		 * For binary compatibility--assumes one level of nesting.
1894 		 */
1895 		if (is64bit) {
1896 			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1897 				return error;
1898 			}
1899 		} else {
1900 			user32_addr_t tmp;
1901 			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1902 				return error;
1903 			}
1904 
1905 			/* munge into LP64 addr */
1906 			devpath = CAST_USER_ADDR_T(tmp);
1907 		}
1908 
1909 		height = 0;
1910 		flags = 0;
1911 	}
1912 
1913 	if (flags != 0) {
1914 		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1915 		return EINVAL;
1916 	}
1917 
1918 	error = get_imgsrc_rootvnode(height, &rvp);
1919 	if (error != 0) {
1920 		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1921 		return error;
1922 	}
1923 
1924 	IMGSRC_DEBUG("got old root vnode\n");
1925 
1926 	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
1927 
1928 	/* Can only move once */
1929 	mp = vnode_mount(rvp);
1930 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1931 		IMGSRC_DEBUG("Already moved.\n");
1932 		error = EBUSY;
1933 		goto out0;
1934 	}
1935 
1936 	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1937 	IMGSRC_DEBUG("Starting updated.\n");
1938 
1939 	/* Get exclusive rwlock on mount, authorize update on mp */
1940 	error = mount_begin_update(mp, ctx, 0);
1941 	if (error != 0) {
1942 		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1943 		goto out0;
1944 	}
1945 
1946 	/*
1947 	 * It can only be moved once.  Flag is set under the rwlock,
1948 	 * so we're now safe to proceed.
1949 	 */
1950 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1951 		IMGSRC_DEBUG("Already moved [2]\n");
1952 		goto out1;
1953 	}
1954 
1955 	IMGSRC_DEBUG("Preparing coveredvp.\n");
1956 
1957 	/* Mark covered vnode as mount in progress, authorize placing mount on top */
1958 	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
1959 	if (error != 0) {
1960 		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1961 		goto out1;
1962 	}
1963 
1964 	IMGSRC_DEBUG("Covered vp OK.\n");
1965 
1966 	/* Sanity check the name caller has provided */
1967 	vfsp = mp->mnt_vtable;
1968 	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1969 		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1970 		    vfsp->vfc_name, fsname);
1971 		error = EINVAL;
1972 		goto out2;
1973 	}
1974 
1975 	/* Check the device vnode and update mount-from name, for local filesystems */
1976 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1977 		IMGSRC_DEBUG("Local, doing device validation.\n");
1978 
1979 		if (devpath != USER_ADDR_NULL) {
1980 			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1981 			if (error) {
1982 				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1983 				goto out2;
1984 			}
1985 
1986 			vnode_put(devvp);
1987 		}
1988 	}
1989 
1990 	/*
1991 	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
1992 	 * and increment the name cache's mount generation
1993 	 */
1994 
1995 	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1996 	error = place_mount_and_checkdirs(mp, vp, ctx);
1997 	if (error != 0) {
1998 		goto out2;
1999 	}
2000 
2001 	placed = TRUE;
2002 
2003 	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2004 	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2005 
2006 	/* Forbid future moves */
2007 	mount_lock(mp);
2008 	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2009 	mount_unlock(mp);
2010 
2011 	/* Finally, add to mount list, completely ready to go */
2012 	if (mount_list_add(mp) != 0) {
2013 		/*
2014 		 * The system is shutting down trying to umount
2015 		 * everything, so fail with a plausible errno.
2016 		 */
2017 		error = EBUSY;
2018 		goto out3;
2019 	}
2020 
2021 	mount_end_update(mp);
2022 	vnode_put(rvp);
2023 	zfree(ZV_NAMEI, old_mntonname);
2024 
2025 	vfs_notify_mount(pvp);
2026 
2027 	return 0;
2028 out3:
2029 	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2030 
2031 	mount_lock(mp);
2032 	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2033 	mount_unlock(mp);
2034 
2035 out2:
2036 	/*
2037 	 * Placing the mp on the vnode clears VMOUNT,
2038 	 * so cleanup is different after that point
2039 	 */
2040 	if (placed) {
2041 		/* Rele the vp, clear VMOUNT and v_mountedhere */
2042 		undo_place_on_covered_vp(mp, vp);
2043 	} else {
2044 		vnode_lock_spin(vp);
2045 		CLR(vp->v_flag, VMOUNT);
2046 		vnode_unlock(vp);
2047 	}
2048 out1:
2049 	mount_end_update(mp);
2050 
2051 out0:
2052 	vnode_put(rvp);
2053 	zfree(ZV_NAMEI, old_mntonname);
2054 	return error;
2055 }
2056 
2057 #endif /* CONFIG_IMGSRC_ACCESS */
2058 
/*
 * enablequotas: turn on disk quotas for a newly mounted HFS volume.
 *
 * For each quota type, look up the per-type quota-options trigger file
 * (<mntonname>/<QUOTAOPSNAME>.<type>); if it exists, request Q_QUOTAON
 * via VFS_QUOTACTL() with the path of the matching quota data file
 * (<mntonname>/<QUOTAFILENAME>.<type>).  All errors are deliberately
 * ignored so quota setup can never cause the enclosing mount to fail.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	/* Quotas are only enabled here for HFS; all other filesystems bail out. */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		/* Drop the iocount from the lookup; only existence mattered. */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2092 
2093 
/*
 * checkdirs_callback: per-process worker invoked by checkdirs() via
 * proc_iterate().
 *
 * If the process's current directory (fd_cdir) or root directory
 * (fd_rdir) is 'olddp' -- the vnode a filesystem was just mounted on --
 * replace it with 'newdp', the root of the new mount, so the process
 * does not keep referencing the now-covered vnode.
 *
 * Always returns PROC_RETURNED so the iteration continues; a failure
 * to obtain a reference on newdp simply leaves this process untouched.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;	/* ref to drop if cwd is NOT switched */
	vnode_t new_rvp = newdp;	/* ref to drop if root is NOT switched */
	vnode_t old_cvp = NULL;		/* previous cwd to release, if switched */
	vnode_t old_rvp = NULL;		/* previous root to release, if switched */

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* Second ref failed: undo the first and give up on this proc. */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;		/* this ref is now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;		/* this ref is now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2173 
2174 
2175 
2176 /*
2177  * Scan all active processes to see if any of them have a current
2178  * or root directory onto which the new filesystem has just been
2179  * mounted. If so, replace them with the new mount point.
2180  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/*
	 * A single usecount means nobody else holds a persistent
	 * reference on the covered vnode, so no process can have it as
	 * cwd or root -- nothing to do.
	 */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Resolve the root vnode of the filesystem just mounted over olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was mounted over, swap the global rootvnode. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);	/* release the old root's persistent ref */
	}

	vnode_put(newdp);	/* drop the iocount from VFS_ROOT() */
	return 0;
}
2218 
2219 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2220 	"com.apple.private.vfs.role-account-unmount"
2221 
2222 /*
2223  * Unmount a file system.
2224  *
2225  * Note: unmount takes a path to the vnode mounted on as argument,
2226  * not special file (as before).
2227  */
2228 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	/* Resolve the user-supplied path to the mounted-on vnode. */
	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	/* MAC policy check: may this context unmount this mount? */
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Take a mount ref before dropping the vnode iocount. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2275 
/*
 * vfs_unmountbyfsid: unmount the filesystem identified by 'fsid'.
 *
 * Looks the mount up by fsid (taking an iteration reference), converts
 * that into a real mount ref, and hands it to safedounmount(), which
 * consumes the ref.  Returns ENOENT if no mount matches the fsid.
 */
int
vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
{
	mount_t mp;

	mp = mount_list_lookupby_fsid(fsid, 0, 1);
	if (mp == (mount_t)0) {
		return ENOENT;
	}
	mount_ref(mp, 0);	/* hold a mount ref ... */
	mount_iterdrop(mp);	/* ... then drop the lookup's iter ref */
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2290 
2291 /*
2292  * The mount struct comes with a mount ref which will be consumed.
2293  * Do the actual file system unmount, prevent some common foot shooting.
2294  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 *
	 * NOTE(review): MNT_LNOTRESP is tested against mnt_kern_flag
	 * here although its name suggests an mnt_lflag bit -- confirm
	 * the intended flag/field pairing.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() consumes the mount ref on both success and failure. */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Early failure: release the caller's mount ref ourselves. */
	mount_drop(mp, 0);
	return error;
}
2358 
2359 /*
2360  * Do the actual file system unmount.
2361  */
/*
 * dounmount: core unmount path.
 *
 * Marks the mount as unmount-in-progress, optionally force-unmounts
 * submounts, syncs and flushes the mount's vnodes, calls the
 * filesystem's VFS_UNMOUNT, then detaches the mount from its covered
 * vnode and the global mount list and drains remaining references.
 * On failure the in-progress flags are cleared so the mount remains
 * usable.  'withref' indicates the caller passed in a mount ref for us
 * to consume.  Returns 0 or an errno.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep a non-kernel caller from hanging on unresponsive remote FSes. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* Flush dirty data; a failed sync aborts a non-forced unmount. */
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* Reclaim this mount's vnodes (force-closing them if MNT_FORCE). */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* vflush failed on a non-forced unmount: undo in-progress state. */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused the unmount: restore iteration and flags. */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Temporarily drop the mount rwlock around the mount-list removal. */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* NB: the mount lock is held here on every path above. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Notify watchers of the parent directory, if we found one. */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* The root mount has no covered vnode: destroy it directly. */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2646 
2647 /*
2648  * Unmount any mounts in this filesystem.
2649  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: cannot block for memory while holding mount_list_lock. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* smp sits on top of a mount already slated: record it too. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			/* dounmount() consumes the mount ref. */
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
2707 
/*
 * mount_dropcrossref: release one cross-reference on 'mp' taken while
 * it covered vnode 'dp'.  If this was the last crossref and 'mp' is no
 * longer mounted on 'dp', the mount structure itself is destroyed.
 * When 'need_put' is set, the caller's iocount on 'dp' is also dropped
 * (while the vnode lock is still held).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref and no longer mounted here: tear the mount down. */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2736 
2737 
2738 /*
2739  * Sync each mounted filesystem.
2740  */
#if DIAGNOSTIC
int syncprt = 0;        /* when set, sync paths dump buffer stats via vfs_bufstats() */
#endif

int print_vmpage_stat = 0;      /* when set, sync paths report dirty pages via vm_countdirtypages() */
2746 
2747 /*
2748  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
2749  *			mounted read-write with the passed waitfor value.
2750  *
2751  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
2752  *		arg	user argument (please see below)
2753  *
2754  * User argument is a pointer to 32 bit unsigned integer which describes the
2755  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
2756  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2757  * waitfor value.
2758  *
2759  * Returns:		VFS_RETURNED
2760  */
2761 static int
sync_callback(mount_t mp,void * arg)2762 sync_callback(mount_t mp, void *arg)
2763 {
2764 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2765 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
2766 		unsigned waitfor = MNT_NOWAIT;
2767 
2768 		if (arg) {
2769 			waitfor = *(uint32_t*)arg;
2770 		}
2771 
2772 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
2773 		if (waitfor != MNT_WAIT &&
2774 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
2775 		    waitfor != MNT_NOWAIT &&
2776 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2777 		    waitfor != MNT_DWAIT &&
2778 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2779 			panic("Passed inappropriate waitfor %u to "
2780 			    "sync_callback()", waitfor);
2781 		}
2782 
2783 		mp->mnt_flag &= ~MNT_ASYNC;
2784 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2785 		if (asyncflag) {
2786 			mp->mnt_flag |= MNT_ASYNC;
2787 		}
2788 	}
2789 
2790 	return VFS_RETURNED;
2791 }
2792 
2793 /* ARGSUSED */
2794 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)2795 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2796 {
2797 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2798 
2799 	if (print_vmpage_stat) {
2800 		vm_countdirtypages();
2801 	}
2802 
2803 #if DIAGNOSTIC
2804 	if (syncprt) {
2805 		vfs_bufstats();
2806 	}
2807 #endif /* DIAGNOSTIC */
2808 	return 0;
2809 }
2810 
/* Media-reliability filter used by sync_internal_callback(). */
typedef enum {
	SYNC_ALL = 0,                           /* no filtering: sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,           /* local mounts that are not virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2          /* virtual-device or non-local mounts */
} sync_type_t;
2816 
2817 static int
sync_internal_callback(mount_t mp,void * arg)2818 sync_internal_callback(mount_t mp, void *arg)
2819 {
2820 	if (arg) {
2821 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2822 		    (mp->mnt_flag & MNT_LOCAL);
2823 		sync_type_t sync_type = *((sync_type_t *)arg);
2824 
2825 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2826 			return VFS_RETURNED;
2827 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2828 			return VFS_RETURNED;
2829 		}
2830 	}
2831 
2832 	(void)sync_callback(mp, NULL);
2833 
2834 	return VFS_RETURNED;
2835 }
2836 
int sync_thread_state = 0;      /* SYNC_THREAD_* bits, manipulated under sync_mtx_lck */
int sync_timeout_seconds = 5;   /* max time sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN       0x0001    /* work is pending for sync_thread */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync_thread instance is active */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread; NULL when idle */
#endif /* CONFIG_PHYS_WRITE_ACCT */
2845 #endif /* CONFIG_PHYS_WRITE_ACCT */
2846 
/*
 * sync_thread: body of the kernel thread spawned by sync_internal().
 *
 * Loops while SYNC_THREAD_RUN is set (it may be re-set by another
 * sync_internal() call while a pass is in flight); each pass syncs
 * reliable (local, non-virtual) media first, then unreliable media.
 * On exit it wakes any sync_internal() waiters and clears
 * SYNC_THREAD_RUNNING -- the wakeup deliberately precedes the unlock
 * (see comment below).
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		/* Drop the lock while iterating mounts; syncing can block. */
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2890 
/* Last time sync_internal() logged a timeout; rate-limits the message (see 120s check below). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2892 
2893 /*
2894  * An in-kernel sync for power management to call.
2895  * This function always returns within sync_timeout seconds.
2896  */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Flag pending work; start a sync thread only if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			/* Always report success; the sync is best-effort. */
			return 0;
		}
		thread_created = TRUE;
	}

	/* Wait (bounded by sync_timeout_seconds) for the sync thread's wakeup;
	 * PDROP releases sync_mtx_lck on return. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2939 
2940 /*
2941  * Change filesystem quotas.
2942  */
2943 #if QUOTA
/*
 * quotactl: manipulate filesystem quotas for the volume containing
 * uap->path.  Copies in any command-specific argument, dispatches to
 * the filesystem via VFS_QUOTACTL(), then copies results back out for
 * the query subcommands.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold a mount ref; the looked-up vnode itself is no longer needed. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge into the kernel dqblk. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only dispatch to the filesystem if the copyin phase succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results out (and release the Q_QUOTAON path buffer). */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3050 #else
/*
 * Stub used when the kernel is built without QUOTA support: the
 * quotactl(2) system call always fails with EOPNOTSUPP.
 */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3056 #endif /* QUOTA */
3057 
3058 /*
3059  * Get filesystem statistics.
3060  *
3061  * Returns:	0			Success
3062  *	namei:???
3063  *	vfs_update_vfsstat:???
3064  *	munge_statfs:EFAULT
3065  */
3066 /* ARGSUSED */
3067 int
statfs(__unused proc_t p,struct statfs_args * uap,__unused int32_t * retval)3068 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3069 {
3070 	struct mount *mp;
3071 	struct vfsstatfs *sp;
3072 	int error;
3073 	struct nameidata nd;
3074 	vfs_context_t ctx = vfs_context_current();
3075 	vnode_t vp;
3076 
3077 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3078 	    UIO_USERSPACE, uap->path, ctx);
3079 	error = namei(&nd);
3080 	if (error != 0) {
3081 		return error;
3082 	}
3083 	vp = nd.ni_vp;
3084 	mp = vp->v_mount;
3085 	sp = &mp->mnt_vfsstat;
3086 	nameidone(&nd);
3087 
3088 #if CONFIG_MACF
3089 	error = mac_mount_check_stat(ctx, mp);
3090 	if (error != 0) {
3091 		vnode_put(vp);
3092 		return error;
3093 	}
3094 #endif
3095 
3096 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3097 	if (error != 0) {
3098 		vnode_put(vp);
3099 		return error;
3100 	}
3101 
3102 	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3103 	vnode_put(vp);
3104 	return error;
3105 }
3106 
3107 /*
3108  * Get filesystem statistics.
3109  */
3110 /* ARGSUSED */
3111 int
fstatfs(__unused proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3112 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3113 {
3114 	vnode_t vp;
3115 	struct mount *mp;
3116 	struct vfsstatfs *sp;
3117 	int error;
3118 
3119 	AUDIT_ARG(fd, uap->fd);
3120 
3121 	if ((error = file_vnode(uap->fd, &vp))) {
3122 		return error;
3123 	}
3124 
3125 	error = vnode_getwithref(vp);
3126 	if (error) {
3127 		file_drop(uap->fd);
3128 		return error;
3129 	}
3130 
3131 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3132 
3133 	mp = vp->v_mount;
3134 	if (!mp) {
3135 		error = EBADF;
3136 		goto out;
3137 	}
3138 
3139 #if CONFIG_MACF
3140 	error = mac_mount_check_stat(vfs_context_current(), mp);
3141 	if (error != 0) {
3142 		goto out;
3143 	}
3144 #endif
3145 
3146 	sp = &mp->mnt_vfsstat;
3147 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3148 		goto out;
3149 	}
3150 
3151 	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3152 
3153 out:
3154 	file_drop(uap->fd);
3155 	vnode_put(vp);
3156 
3157 	return error;
3158 }
3159 
3160 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3161 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3162 {
3163 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3164 
3165 	bzero(sfs, sizeof(*sfs));
3166 
3167 	sfs->f_bsize = vsfs->f_bsize;
3168 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3169 	sfs->f_blocks = vsfs->f_blocks;
3170 	sfs->f_bfree = vsfs->f_bfree;
3171 	sfs->f_bavail = vsfs->f_bavail;
3172 	sfs->f_files = vsfs->f_files;
3173 	sfs->f_ffree = vsfs->f_ffree;
3174 	sfs->f_fsid = vsfs->f_fsid;
3175 	sfs->f_owner = vsfs->f_owner;
3176 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3177 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3178 	sfs->f_fssubtype = vsfs->f_fssubtype;
3179 	sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3180 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3181 		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3182 	} else {
3183 		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3184 	}
3185 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3186 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3187 }
3188 
3189 /*
3190  * Get file system statistics in 64-bit mode
3191  */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/*
	 * nameidata and statfs64 are both large; allocate them together on
	 * the heap instead of consuming that much kernel stack.
	 */
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh the cached vfsstat before reporting it. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	/* Note: 'p' is marked __unused but is consulted for the iopolicy here. */
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3249 
3250 /*
3251  * Get file system statistics in 64-bit mode
3252  */
3253 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3254 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3255 {
3256 	struct vnode *vp;
3257 	struct mount *mp;
3258 	struct statfs64 sfs;
3259 	int error;
3260 
3261 	AUDIT_ARG(fd, uap->fd);
3262 
3263 	if ((error = file_vnode(uap->fd, &vp))) {
3264 		return error;
3265 	}
3266 
3267 	error = vnode_getwithref(vp);
3268 	if (error) {
3269 		file_drop(uap->fd);
3270 		return error;
3271 	}
3272 
3273 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3274 
3275 	mp = vp->v_mount;
3276 	if (!mp) {
3277 		error = EBADF;
3278 		goto out;
3279 	}
3280 
3281 #if CONFIG_MACF
3282 	error = mac_mount_check_stat(vfs_context_current(), mp);
3283 	if (error != 0) {
3284 		goto out;
3285 	}
3286 #endif
3287 
3288 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3289 		goto out;
3290 	}
3291 
3292 	vfs_get_statfs64(mp, &sfs);
3293 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3294 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3295 		/* This process does not want to see a seperate data volume mountpoint */
3296 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3297 	}
3298 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3299 
3300 out:
3301 	file_drop(uap->fd);
3302 	vnode_put(vp);
3303 
3304 	return error;
3305 }
3306 
/*
 * Shared iteration state handed to the getfsstat()/getfsstat64()
 * per-mount callbacks via vfs_iterate().
 */
struct getfsstat_struct {
	user_addr_t     sfsp;           /* user buffer cursor; advanced after each copyout */
	user_addr_t     *mp;            /* per-entry MAC label destinations, or NULL */
	int             count;          /* mounts visited so far (may exceed maxcount) */
	int             maxcount;       /* number of entries that fit in the user buffer */
	int             flags;          /* caller's MNT_NOWAIT/MNT_WAIT/MNT_DWAIT flags */
	int             error;          /* first error hit by the callback, if any */
};
3315 
3316 
/*
 * vfs_iterate() callback for getfsstat(): copy one mount's statistics
 * (and optionally its MAC label) out to the user buffer.  Always bumps
 * fstp->count so the caller can report the total number of mounts even
 * when the buffer is full.  Returns VFS_RETURNED_DONE to abort the
 * iteration after recording an error in fstp->error.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		/* Dead mounts, and refresh failures, are silently skipped. */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* munge_statfs reported how many bytes it wrote; advance past them. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3370 
3371 /*
3372  * Get statistics on all filesystems.
3373  */
3374 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3375 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3376 {
3377 	struct __mac_getfsstat_args muap;
3378 
3379 	muap.buf = uap->buf;
3380 	muap.bufsize = uap->bufsize;
3381 	muap.mac = USER_ADDR_NULL;
3382 	muap.macsize = 0;
3383 	muap.flags = uap->flags;
3384 
3385 	return __mac_getfsstat(p, &muap, retval);
3386 }
3387 
3388 /*
3389  * __mac_getfsstat: Get MAC-related file system statistics
3390  *
3391  * Parameters:    p                        (ignored)
3392  *                uap                      User argument descriptor (see below)
3393  *                retval                   Count of file system statistics (N stats)
3394  *
3395  * Indirect:      uap->bufsize             Buffer size
3396  *                uap->macsize             MAC info size
3397  *                uap->buf                 Buffer where information will be returned
3398  *                uap->mac                 MAC info
3399  *                uap->flags               File system flags
3400  *
3401  *
3402  * Returns:        0                       Success
3403  *                !0                       Not success
3404  *
3405  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes so large that the int-based accounting below could overflow. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* How many statfs records (of the caller's ABI size) fit in the buffer. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* Caller must supply exactly one label pointer per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit user pointers; copy 64-bit ones through as-is. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Visit every mount; per-mount work happens in getfsstat_callback(). */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report entries copied out, capped at what the buffer could hold. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3499 
/*
 * vfs_iterate() callback for getfsstat64(): copy one mount's statistics,
 * as a struct statfs64, out to the user buffer.  Always bumps
 * fstp->count so the caller can report the total number of mounts even
 * when the buffer is full; VFS_RETURNED_DONE aborts the walk after an
 * error has been recorded in fstp->error.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Only copy out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		/* Dead mounts, and refresh failures, are silently skipped. */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3544 
3545 /*
3546  * Get statistics on all file systems in 64 bit mode.
3547  */
3548 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3549 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3550 {
3551 	user_addr_t sfsp;
3552 	int count, maxcount;
3553 	struct getfsstat_struct fst;
3554 
3555 	maxcount = uap->bufsize / sizeof(struct statfs64);
3556 
3557 	sfsp = uap->buf;
3558 	count = 0;
3559 
3560 	fst.sfsp = sfsp;
3561 	fst.flags = uap->flags;
3562 	fst.count = 0;
3563 	fst.error = 0;
3564 	fst.maxcount = maxcount;
3565 
3566 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3567 
3568 	if (fst.error) {
3569 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3570 		return fst.error;
3571 	}
3572 
3573 	if (fst.sfsp && fst.count > fst.maxcount) {
3574 		*retval = fst.maxcount;
3575 	} else {
3576 		*retval = fst.count;
3577 	}
3578 
3579 	return 0;
3580 }
3581 
3582 /*
3583  * gets the associated vnode with the file descriptor passed.
3584  * as input
3585  *
3586  * INPUT
3587  * ctx - vfs context of caller
3588  * fd - file descriptor for which vnode is required.
3589  * vpp - Pointer to pointer to vnode to be returned.
3590  *
3591  * The vnode is returned with an iocount so any vnode obtained
3592  * by this call needs a vnode_put
3593  *
3594  */
3595 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3596 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3597 {
3598 	int error;
3599 	vnode_t vp;
3600 	struct fileproc *fp;
3601 	proc_t p = vfs_context_proc(ctx);
3602 
3603 	*vpp =  NULLVP;
3604 
3605 	error = fp_getfvp(p, fd, &fp, &vp);
3606 	if (error) {
3607 		return error;
3608 	}
3609 
3610 	error = vnode_getwithref(vp);
3611 	if (error) {
3612 		(void)fp_drop(p, fd, fp, 0);
3613 		return error;
3614 	}
3615 
3616 	(void)fp_drop(p, fd, fp, 0);
3617 	*vpp = vp;
3618 	return error;
3619 }
3620 
3621 /*
3622  * Wrapper function around namei to start lookup from a directory
3623  * specified by a file descriptor ni_dirfd.
3624  *
3625  * In addition to all the errors returned by namei, this call can
3626  * return ENOTDIR if the file descriptor does not refer to a directory.
3627  * and EBADF if the file descriptor is not valid.
3628  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only honor dirfd for a fresh lookup of a relative path: a
	 * continued lookup or a caller-supplied dvp (USEDVP) already
	 * carries its own starting directory.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first character of the path to test for '/'. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Point namei at dirfd's vnode via the USEDVP
			 * protocol, then clear the flag again so the
			 * nameidata can be reused normally.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei. */
	return namei(ndp);
}
3672 
3673 /*
3674  * Change current working directory to a given file descriptor.
3675  */
3676 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the target directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend into the root of the
	 * mounted filesystem (repeatedly, in case of stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Convert our iocount to a long-lived usecount for the cwd slot. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the process cwd under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, outside the locks. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
3789 
/* fchdir(2): change the process-wide working directory to the fd's vnode. */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
3795 
/*
 * Per-thread variant of fchdir (fd == -1 reverts to the process cwd).
 * NOTE(review): the cast assumes struct __pthread_fchdir_args has the
 * same layout as struct fchdir_args — confirm against the syscall tables.
 */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
3801 
3802 
3803 /*
3804  * Change current working directory (".").
3805  *
3806  * Returns:	0			Success
3807  *	change_dir:ENOTDIR
3808  *	change_dir:???
3809  *	vnode_ref:ENOENT		No such file or directory
3810  */
3811 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve the path and check it is a searchable directory. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert the iocount from change_dir into a long-lived usecount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap the process cwd under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, outside the locks. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3857 
3858 
3859 /*
3860  * Change current working directory (".").
3861  *
3862  * Returns:	0			Success
3863  *	chdir_internal:ENOTDIR
3864  *	chdir_internal:ENOENT		No such file or directory
3865  *	chdir_internal:???
3866  */
3867 /* ARGSUSED */
3868 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)3869 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3870 {
3871 	struct nameidata nd;
3872 	vfs_context_t ctx = vfs_context_current();
3873 
3874 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3875 	    UIO_USERSPACE, uap->path, ctx);
3876 
3877 	return chdir_internal(p, ctx, &nd, per_thread);
3878 }
3879 
3880 
3881 /*
3882  * chdir
3883  *
3884  * Change current working directory (".") for the entire process
3885  *
3886  * Parameters:  p       Process requesting the call
3887  *              uap     User argument descriptor (see below)
3888  *              retval  (ignored)
3889  *
3890  * Indirect parameters:	uap->path	Directory path
3891  *
3892  * Returns:	0			Success
3893  *              common_chdir: ENOTDIR
3894  *              common_chdir: ENOENT	No such file or directory
3895  *              common_chdir: ???
3896  *
3897  */
/*
 * chdir(2): process-wide working directory change.
 * NOTE(review): the cast assumes struct chdir_args matches the layout
 * common_chdir expects — confirm against the syscall tables.
 */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	return common_chdir(p, (void *)uap, 0);
}
3903 
3904 /*
3905  * __pthread_chdir
3906  *
3907  * Change current working directory (".") for a single thread
3908  *
3909  * Parameters:  p       Process requesting the call
3910  *              uap     User argument descriptor (see below)
3911  *              retval  (ignored)
3912  *
3913  * Indirect parameters:	uap->path	Directory path
3914  *
3915  * Returns:	0			Success
3916  *              common_chdir: ENOTDIR
3917  *		common_chdir: ENOENT	No such file or directory
3918  *		common_chdir: ???
3919  *
3920  */
/*
 * Per-thread working directory change.
 * NOTE(review): the cast assumes struct __pthread_chdir_args matches
 * the layout common_chdir expects — confirm against the syscall tables.
 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	return common_chdir(p, (void *)uap, 1);
}
3926 
3927 
3928 /*
3929  * Change notion of root (``/'') directory.
3930  */
3931 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot(2) requires superuser privilege. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/* Resolve the path and check it is a searchable directory. */
	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Convert the iocount from change_dir into a long-lived usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount held by the previous root, outside the locks. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3989 
3990 #define PATHSTATICBUFLEN 256
3991 #define PIVOT_ROOT_ENTITLEMENT              \
3992        "com.apple.private.vfs.pivot-root"
3993 
3994 #if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(2): switch the system root filesystem to the volume mounted
 * at new_rootfs_path_before, remounting the old root at
 * old_rootfs_path_after.  Restricted to pid 1 (launchd) holding the
 * pivot-root entitlement.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Try the small stack buffer first; fall back to a MAXPATHLEN heap buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the old-root destination path. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4086 #else
/* pivot_root is only implemented on macOS; other targets report ENOSYS. */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
4092 #endif /* XNU_TARGET_OS_OSX */
4093 
4094 /*
4095  * Common routine for chroot and chdir.
4096  *
4097  * Returns:	0			Success
4098  *		ENOTDIR			Not a directory
4099  *		namei:???		[anything namei can return]
4100  *		vnode_authorize:???	[anything vnode_authorize can return]
4101  */
4102 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4103 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4104 {
4105 	vnode_t vp;
4106 	int error;
4107 
4108 	if ((error = namei(ndp))) {
4109 		return error;
4110 	}
4111 	nameidone(ndp);
4112 	vp = ndp->ni_vp;
4113 
4114 	if (vp->v_type != VDIR) {
4115 		vnode_put(vp);
4116 		return ENOTDIR;
4117 	}
4118 
4119 #if CONFIG_MACF
4120 	error = mac_vnode_check_chdir(ctx, vp);
4121 	if (error) {
4122 		vnode_put(vp);
4123 		return error;
4124 	}
4125 #endif
4126 
4127 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4128 	if (error) {
4129 		vnode_put(vp);
4130 		return error;
4131 	}
4132 
4133 	return error;
4134 }
4135 
/*
 * Allocate the per-file-descriptor vnode data (used for directories).
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	/* Z_ZERO leaves fv_buf NULL and fv_bufallocsiz zero until first use. */
	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4149 
4150 /*
4151  * Free the vnode data (for directories) associated with the file glob.
4152  */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Release the directory read buffer (fv_buf may still be NULL). */
	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
	kfree_type(struct fd_vn_data, fvdata);
}
4162 
4163 /*
4164  * Check permissions, allocate an open file structure,
4165  * and call the device open routine if any.
4166  *
4167  * Returns:	0			Success
4168  *		EINVAL
4169  *		EINTR
4170  *	falloc:ENFILE
4171  *	falloc:EMFILE
4172  *	falloc:ENOMEM
4173  *	vn_open_auth:???
4174  *	dupfdopen:???
4175  *	VNOP_ADVLOCK:???
4176  *	vnode_setsize:???
4177  *
4178  * XXX Need to implement uid, gid
4179  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct vfs_context context;

	oflags = uflags;

	/* Both access-mode bits set at once (O_RDWR|O_WRONLY) is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/*
	 * Strip the raw-encryption flags from the caller-supplied bits;
	 * presumably only vn_open_auth may set them (they are preserved in
	 * fg_flag below) — TODO confirm against vn_open_auth.
	 */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor and fileproc before doing the lookup. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		/*
		 * ENODEV/ENXIO with uu_dupfd set means the lookup went through
		 * /dev/fd (fdesc_open); service it by duplicating that fd.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Record the open mode and wire the vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

	/* Apply an open-time flock(2)-style lock if requested. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so 'bad:' (and close) can drop it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 *
	 * NOTE(review): vp's iocount was dropped just above; the fileglob
	 * still references vp (fp_set_data), presumably holding a usecount
	 * that keeps it valid here — confirm before reordering these calls.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's pages may live in the secluded pool.
	 * Writable files are always excluded; otherwise eligibility depends
	 * on the secluded_for_filecache policy (1 = path-based allowlist,
	 * 2 = name-based denylist for Camera-launch-critical binaries).
	 */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/*
	 * Error unwind after the vnode was opened: undo the advisory lock
	 * (if taken), close the vnode, drop the iocount, and release the
	 * reserved fd slot. Use the fileglob's credential for the close.
	 */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4388 
4389 /*
4390  * While most of the *at syscall handlers can call nameiat() which
4391  * is a wrapper around namei, the use of namei and initialisation
4392  * of nameidata are far removed and in different functions  - namei
4393  * gets called in vn_open_auth for open1. So we'll just do here what
4394  * nameiat() does.
4395  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd)
{
	/*
	 * Only a relative path with a real dirfd needs special handling;
	 * a caller who already supplied a starting dvp (USEDVP) is left alone.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to test for absoluteness. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: resolve it against the dirfd vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Hand the starting directory to namei via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or caller-supplied dvp: plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval);
}
4439 
4440 /*
4441  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4442  *
4443  * Parameters:	p			Process requesting the open
4444  *		uap			User argument descriptor (see below)
4445  *		retval			Pointer to an area to receive the
 *					return value from the system call
4447  *
4448  * Indirect:	uap->path		Path to open (same as 'open')
 *		uap->flags		Flags to open (same as 'open')
4450  *		uap->uid		UID to set, if creating
4451  *		uap->gid		GID to set, if creating
4452  *		uap->mode		File mode, if creating (same as 'open')
4453  *		uap->xsecurity		ACL to set, if creating
4454  *
4455  * Returns:	0			Success
4456  *		!0			errno value
4457  *
4458  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4459  *
 * XXX:		We should enumerate the possible errno values here, and where
4461  *		in the code they originated.
4462  */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller's ACL, if one was supplied. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	/* Build creation attributes: umask-filtered mode, optional uid/gid/ACL. */
	VATTR_INIT(&va);
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		/* va_acl points into xsecdst, which stays alive across open1(). */
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4505 
4506 /*
4507  * Go through the data-protected atomically controlled open (2)
4508  *
4509  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4510  */
4511 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4512 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4513 {
4514 	int flags = uap->flags;
4515 	int class = uap->class;
4516 	int dpflags = uap->dpflags;
4517 
4518 	/*
4519 	 * Follow the same path as normal open(2)
4520 	 * Look up the item if it exists, and acquire the vnode.
4521 	 */
4522 	struct vnode_attr va;
4523 	struct nameidata nd;
4524 	int cmode;
4525 	int error;
4526 
4527 	VATTR_INIT(&va);
4528 	/* Mask off all but regular access permissions */
4529 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4530 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4531 
4532 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4533 	    uap->path, vfs_context_current());
4534 
4535 	/*
4536 	 * Initialize the extra fields in vnode_attr to pass down our
4537 	 * extra fields.
4538 	 * 1. target cprotect class.
4539 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4540 	 */
4541 	if (flags & O_CREAT) {
4542 		/* lower level kernel code validates that the class is valid before applying it. */
4543 		if (class != PROTECTION_CLASS_DEFAULT) {
4544 			/*
4545 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4546 			 * file behave the same as open (2)
4547 			 */
4548 			VATTR_SET(&va, va_dataprotect_class, class);
4549 		}
4550 	}
4551 
4552 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4553 		if (flags & (O_RDWR | O_WRONLY)) {
4554 			/* Not allowed to write raw encrypted bytes */
4555 			return EINVAL;
4556 		}
4557 		if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4558 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4559 		}
4560 		if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4561 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4562 		}
4563 	}
4564 
4565 	error = open1(vfs_context_current(), &nd, uap->flags, &va,
4566 	    NULL, NULL, retval);
4567 
4568 	return error;
4569 }
4570 
4571 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)4572 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4573     int fd, enum uio_seg segflg, int *retval)
4574 {
4575 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4576 	struct {
4577 		struct vnode_attr va;
4578 		struct nameidata nd;
4579 	} *__open_data;
4580 	struct vnode_attr *vap;
4581 	struct nameidata *ndp;
4582 	int cmode;
4583 	int error;
4584 
4585 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
4586 	vap = &__open_data->va;
4587 	ndp = &__open_data->nd;
4588 
4589 	VATTR_INIT(vap);
4590 	/* Mask off all but regular access permissions */
4591 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4592 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
4593 
4594 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4595 	    segflg, path, ctx);
4596 
4597 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd);
4598 
4599 	kfree_type(typeof(*__open_data), __open_data);
4600 
4601 	return error;
4602 }
4603 
4604 int
open(proc_t p,struct open_args * uap,int32_t * retval)4605 open(proc_t p, struct open_args *uap, int32_t *retval)
4606 {
4607 	__pthread_testcancel(1);
4608 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4609 }
4610 
4611 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)4612 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4613     int32_t *retval)
4614 {
4615 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
4616 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4617 }
4618 
4619 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)4620 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4621     int32_t *retval)
4622 {
4623 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
4624 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
4625 }
4626 
4627 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)4628 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4629 {
4630 	__pthread_testcancel(1);
4631 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4632 }
4633 
4634 /*
4635  * openbyid_np: open a file given a file system id and a file system object id
4636  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
4637  *	file systems that don't support object ids it is a node id (uint64_t).
4638  *
4639  * Parameters:	p			Process requesting the open
4640  *		uap			User argument descriptor (see below)
4641  *		retval			Pointer to an area to receive the
 *					return value from the system call
4643  *
4644  * Indirect:	uap->path		Path to open (same as 'open')
4645  *
4646  *		uap->fsid		id of target file system
4647  *		uap->objid		id of target file system object
4648  *		uap->flags		Flags to open (same as 'open')
4649  *
4650  * Returns:	0			Success
4651  *		!0			errno value
4652  *
4653  *
 * XXX:		We should enumerate the possible errno values here, and where
4655  *		in the code they originated.
4656  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Opening by raw fs object id is a privileged operation. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return error;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve the path from (fsid, objid), growing the buffer by
	 * MAXPATHLEN and retrying while the filesystem reports ENOSPC.
	 * On each failed pass the buffer is freed before reallocating.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate (buffer was sized buflen + 1 to guarantee room). */
	buf[pathlen] = 0;

	/* Open the resolved path; buf is a kernel address, hence UIO_SYSSPACE. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
4713 
4714 
4715 /*
4716  * Create a special file.
4717  */
4718 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4719 
/*
 * mknod: create a character or block special file (FIFOs are routed to
 * mkfifo1()). Requires superuser; the new node's mode is filtered through
 * the process umask.
 */
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, (mode_t)uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Device nodes may only be created by the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block specials are handled here. */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4822 
4823 /*
4824  * Create a named pipe.
4825  *
4826  * Returns:	0			Success
4827  *		EEXIST
4828  *	namei:???
4829  *	vnode_authorize:???
4830  *	vn_create:???
4831  */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the parent directory; LOCKPARENT keeps dvp for vn_create. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	/* vp is non-NULL either when it pre-existed or when vn_create made it. */
	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4874 
4875 
4876 /*
4877  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4878  *
4879  * Parameters:	p			Process requesting the open
4880  *		uap			User argument descriptor (see below)
4881  *		retval			(Ignored)
4882  *
4883  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
4884  *		uap->uid		UID to set
4885  *		uap->gid		GID to set
4886  *		uap->mode		File mode to set (same as 'mkfifo')
4887  *		uap->xsecurity		ACL to set, if creating
4888  *
4889  * Returns:	0			Success
4890  *		!0			errno value
4891  *
4892  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4893  *
4894  * XXX:		We should enummerate the possible errno values here, and where
4895  *		in the code they originated.
4896  */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller's ACL, if one was supplied. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	/* Build creation attributes: umask-filtered mode, optional uid/gid/ACL. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		/* va_acl points into xsecdst, which stays alive across mkfifo1(). */
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
4933 
4934 /* ARGSUSED */
4935 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)4936 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4937 {
4938 	struct vnode_attr va;
4939 
4940 	VATTR_INIT(&va);
4941 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4942 
4943 	return mkfifo1(vfs_context_current(), uap->path, &va);
4944 }
4945 
4946 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4947 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4948 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4949 
/*
 * Best-effort path reconstruction for dvp (optionally with leafname
 * appended). Never fails outright: on lookup errors it walks up the
 * parent chain (or falls back to the mount point, or "/") and reports
 * the result as truncated. Returns the length of the string in 'path'
 * including the NUL terminator.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/', then append the leaf name. */
			path[len - 1] = '/';
			/* strlcpy returns the would-be length, so len may exceed MAXPATHLEN. */
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit only marginally (len >= MAXPATHLEN - 1): report truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain until some ancestor's path fits.
		 * NOTE(review): assumes v_parent traversal here is safe
		 * without extra references — confirm against vnode lifecycle.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			/* Reset the in/out length before each retry. */
			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5017 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-aware variant: resolve through firmlinks (firmlink = 1). */
	const int firmlink = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, firmlink);
}
5023 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Variant that does not resolve through firmlinks (firmlink = 0). */
	const int firmlink = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, firmlink);
}
5029 
5030 /*
5031  * Make a hard file link.
5032  *
5033  * Returns:	0			Success
5034  *		EPERM
5035  *		EEXIST
5036  *		EXDEV
5037  *	namei:???
5038  *	vnode_authorize:???
5039  *	VNOP_LINK:???
5040  */
5041 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Look up the target node, reusing the same nameidata with the
	 * nameiop switched from LOOKUP to CREATE and ni_dirp repointed
	 * at the link path.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* refuse to link to anything kauth forbids (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Only build path strings if someone (fsevents/kauth/audit) wants them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of the link.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5255 
5256 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5257 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5258 {
5259 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5260 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5261 }
5262 
5263 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5264 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5265 {
5266 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5267 		return EINVAL;
5268 	}
5269 
5270 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5271 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5272 }
5273 
5274 /*
5275  * Make a symbolic link.
5276  *
5277  * We could add support for ACLs here too...
5278  */
5279 /* ARGSUSED */
/*
 * Common implementation for symlink(2)/symlinkat(2).
 *
 * path_data  - the string the new symlink will contain (the link "target")
 * fd         - directory fd that 'link' is resolved against (AT_FDCWD ok)
 * link       - pathname of the symlink to create
 * segflg     - whether path_data/link live in user or kernel space
 *
 * Returns 0 on success or an errno value.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/*
	 * Copy the link contents into a kernel buffer when they come from
	 * user space; a kernel-space caller's buffer is used directly.
	 */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/*
	 * Look up the name of the symlink to be created; LOCKPARENT keeps
	 * the parent directory vnode (dvp) for the create below.
	 */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* New symlink: type VLNK, mode 0777 masked by the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The lookup found an existing vnode: the name is already taken. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}
	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/*
			 * The filesystem did not hand back the new vnode;
			 * redrive the same nameidata as a plain LOOKUP to
			 * fetch it.
			 */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copyin buffer only if we allocated it above. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5438 
5439 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5440 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5441 {
5442 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5443 	           uap->link, UIO_USERSPACE);
5444 }
5445 
5446 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5447 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5448     __unused int32_t *retval)
5449 {
5450 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5451 	           uap->path2, UIO_USERSPACE);
5452 }
5453 
5454 /*
5455  * Delete a whiteout from the filesystem.
5456  * No longer supported.
5457  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout removal is no longer implemented; unconditionally fail. */
	return ENOTSUP;
}
5463 
5464 /*
5465  * Delete a name from the filesystem.
5466  */
5467 /* ARGSUSED */
/*
 * Common implementation for unlink(2)/unlinkat(2)/delete(2) and the
 * kernel-internal unlink1().
 *
 * fd           - directory fd the path is resolved against (AT_FDCWD ok)
 * start_dvp    - optional starting directory vnode; when non-NULL it
 *                overrides 'fd' (USEDVP lookup)
 * path_arg     - pathname of the entry to remove
 * segflg       - address space of path_arg
 * unlink_flags - VNODE_REMOVE_* modifiers (busy-file semantics, audit
 *                suppression, namespace-event suppression)
 *
 * Returns 0 on success or an errno value.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/*
	 * nameidata (and, with fsevents, the attr/finfo scratch space) is
	 * heap-allocated as one unit to keep this frame small.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;            /* filesystem supports compound remove VNOP */
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset all per-attempt state; we can come back here on ENOENT races. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;       /* may be NULL for a compound-remove filesystem */

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/*
				 * ENOENT here can be a stale-cache race with
				 * a concurrent hardlink removal; redrive the
				 * lookup a bounded number of times.
				 */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			/* Batched path: ask vn_remove to fill in the attrs. */
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Build both the regular and firmlink-free paths for notification. */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		/* Deleting a resource fork: remove the named stream instead. */
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
5741 
5742 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5743 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5744     enum uio_seg segflg, int unlink_flags)
5745 {
5746 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5747 	           unlink_flags);
5748 }
5749 
5750 /*
5751  * Delete a name from the filesystem using Carbon semantics.
5752  */
5753 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)5754 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5755 {
5756 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5757 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5758 }
5759 
5760 /*
5761  * Delete a name from the filesystem using POSIX semantics.
5762  */
5763 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)5764 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5765 {
5766 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5767 	           uap->path, UIO_USERSPACE, 0);
5768 }
5769 
5770 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)5771 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5772 {
5773 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5774 		return EINVAL;
5775 	}
5776 
5777 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5778 		int unlink_flags = 0;
5779 
5780 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
5781 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5782 		}
5783 		return rmdirat_internal(vfs_context_current(), uap->fd,
5784 		           uap->path, UIO_USERSPACE, unlink_flags);
5785 	} else {
5786 		return unlinkat_internal(vfs_context_current(), uap->fd,
5787 		           NULLVP, uap->path, UIO_USERSPACE, 0);
5788 	}
5789 }
5790 
5791 /*
5792  * Reposition read/write file offset.
5793  */
/*
 * lseek(2): reposition the read/write offset of an open file.
 *
 * Supports L_SET/L_INCR/L_XTND plus SEEK_HOLE/SEEK_DATA (the latter two
 * are delegated to the filesystem via VNOP_IOCTL).  Pipes, sockets and
 * FIFOs are not seekable and return ESPIPE.
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* fp_getfvp returns ENOTSUP for non-vnode fds (sockets, pipes). */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:
		/* Relative to the current shared file offset. */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* Relative to end of file. */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Filesystem rewrites 'offset' to the next hole/data start. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
5885 
5886 
5887 /*
5888  * Check access permissions.
5889  *
5890  * Returns:	0			Success
5891  *		vnode_authorize:???
5892  */
5893 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)5894 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5895 {
5896 	kauth_action_t action;
5897 	int error;
5898 
5899 	/*
5900 	 * If just the regular access bits, convert them to something
5901 	 * that vnode_authorize will understand.
5902 	 */
5903 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5904 		action = 0;
5905 		if (uflags & R_OK) {
5906 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
5907 		}
5908 		if (uflags & W_OK) {
5909 			if (vnode_isdir(vp)) {
5910 				action |= KAUTH_VNODE_ADD_FILE |
5911 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
5912 				/* might want delete rights here too */
5913 			} else {
5914 				action |= KAUTH_VNODE_WRITE_DATA;
5915 			}
5916 		}
5917 		if (uflags & X_OK) {
5918 			if (vnode_isdir(vp)) {
5919 				action |= KAUTH_VNODE_SEARCH;
5920 			} else {
5921 				action |= KAUTH_VNODE_EXECUTE;
5922 			}
5923 		}
5924 	} else {
5925 		/* take advantage of definition of uflags */
5926 		action = uflags >> 8;
5927 	}
5928 
5929 #if CONFIG_MACF
5930 	error = mac_vnode_check_access(ctx, vp, uflags);
5931 	if (error) {
5932 		return error;
5933 	}
5934 #endif /* MAC */
5935 
5936 	/* action == 0 means only check for existence */
5937 	if (action != 0) {
5938 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5939 	} else {
5940 		error = 0;
5941 	}
5942 
5943 	return error;
5944 }
5945 
5946 
5947 
5948 /*
5949  * access_extended: Check access permissions in bulk.
5950  *
5951  * Description:	uap->entries		Pointer to an array of accessx
5952  *                                      descriptor structs, plus one or
5953  *                                      more NULL terminated strings (see
5954  *                                      "Notes" section below).
5955  *		uap->size		Size of the area pointed to by
5956  *					uap->entries.
5957  *		uap->results		Pointer to the results array.
5958  *
5959  * Returns:	0			Success
5960  *		ENOMEM			Insufficient memory
5961  *		EINVAL			Invalid arguments
5962  *		namei:EFAULT		Bad address
5963  *		namei:ENAMETOOLONG	Filename too long
5964  *		namei:ENOENT		No such file or directory
5965  *		namei:ELOOP		Too many levels of symbolic links
5966  *		namei:EBADF		Bad file descriptor
5967  *		namei:ENOTDIR		Not a directory
5968  *		namei:???
5969  *		access1:
5970  *
5971  * Implicit returns:
5972  *		uap->results		Array contents modified
5973  *
5974  * Notes:	The uap->entries are structured as an arbitrary length array
5975  *		of accessx descriptors, followed by one or more NULL terminated
5976  *		strings
5977  *
5978  *			struct accessx_descriptor[0]
5979  *			...
5980  *			struct accessx_descriptor[n]
5981  *			char name_data[0];
5982  *
5983  *		We determine the entry count by walking the buffer containing
5984  *		the uap->entries argument descriptor.  For each descriptor we
5985  *		see, the valid values for the offset ad_name_offset will be
5986  *		in the byte range:
5987  *
5988  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
5989  *						to
5990  *				[ uap->entries + uap->size - 2 ]
5991  *
5992  *		since we must have at least one string, and the string must
5993  *		be at least one character plus the NULL terminator in length.
5994  *
5995  * XXX:		Need to support the check-as uid argument
5996  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
	/* Small requests avoid a heap allocation for the descriptor copy. */
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file failures are reported in
		 * the result slot; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6238 
6239 
6240 /*
6241  * Returns:	0			Success
6242  *		namei:EFAULT		Bad address
6243  *		namei:ENAMETOOLONG	Filename too long
6244  *		namei:ENOENT		No such file or directory
6245  *		namei:ELOOP		Too many levels of symbolic links
6246  *		namei:EBADF		Bad file descriptor
6247  *		namei:ENOTDIR		Not a directory
6248  *		namei:???
6249  *		access1:
6250  */
/*
 * Common implementation for access(2)/faccessat(2).
 *
 * fd     - directory fd the path is resolved against (AT_FDCWD ok)
 * amode  - access bits to test (R_OK/W_OK/X_OK/F_OK, plus _DELETE_OK)
 * flag   - AT_EACCESS and/or AT_SYMLINK_NOFOLLOW[_ANY]
 * segflg - address space of 'path'
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* Drop the iocounts namei gave us; dvp only exists if we asked. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* Only unref the cred if we made a private real-identity copy. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6332 
6333 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6334 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6335 {
6336 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6337 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6338 }
6339 
6340 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6341 faccessat(__unused proc_t p, struct faccessat_args *uap,
6342     __unused int32_t *retval)
6343 {
6344 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6345 		return EINVAL;
6346 	}
6347 
6348 	return faccessat_internal(vfs_context_current(), uap->fd,
6349 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6350 }
6351 
6352 /*
6353  * Returns:	0			Success
6354  *		EFAULT
6355  *	copyout:EFAULT
6356  *	namei:???
6357  *	vn_stat:???
6358  */
6359 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6360 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6361     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6362     enum uio_seg segflg, int fd, int flag)
6363 {
6364 	struct nameidata nd;
6365 	int follow;
6366 	union {
6367 		struct stat sb;
6368 		struct stat64 sb64;
6369 	} source = {};
6370 	union {
6371 		struct user64_stat user64_sb;
6372 		struct user32_stat user32_sb;
6373 		struct user64_stat64 user64_sb64;
6374 		struct user32_stat64 user32_sb64;
6375 	} dest = {};
6376 	caddr_t sbp;
6377 	int error, my_size;
6378 	kauth_filesec_t fsec;
6379 	size_t xsecurity_bufsize;
6380 	void * statptr;
6381 	struct fileproc *fp = NULL;
6382 	int needsrealdev = 0;
6383 
6384 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6385 	NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6386 	    segflg, path, ctx);
6387 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6388 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6389 	}
6390 
6391 #if NAMEDRSRCFORK
6392 	int is_namedstream = 0;
6393 	/* stat calls are allowed for resource forks. */
6394 	nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6395 #endif
6396 
6397 	if (flag & AT_FDONLY) {
6398 		vnode_t fvp;
6399 
6400 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6401 		if (error) {
6402 			return error;
6403 		}
6404 		if ((error = vnode_getwithref(fvp))) {
6405 			file_drop(fd);
6406 			return error;
6407 		}
6408 		nd.ni_vp = fvp;
6409 	} else {
6410 		error = nameiat(&nd, fd);
6411 		if (error) {
6412 			return error;
6413 		}
6414 	}
6415 	fsec = KAUTH_FILESEC_NONE;
6416 
6417 	statptr = (void *)&source;
6418 
6419 #if NAMEDRSRCFORK
6420 	/* Grab reference on the shadow stream file vnode to
6421 	 * force an inactive on release which will mark it
6422 	 * for recycle.
6423 	 */
6424 	if (vnode_isnamedstream(nd.ni_vp) &&
6425 	    (nd.ni_vp->v_parent != NULLVP) &&
6426 	    vnode_isshadow(nd.ni_vp)) {
6427 		is_namedstream = 1;
6428 		vnode_ref(nd.ni_vp);
6429 	}
6430 #endif
6431 
6432 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6433 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6434 		/*
6435 		 * If the caller has the file open, and is not
6436 		 * requesting extended security information, we are
6437 		 * going to let them get the basic stat information.
6438 		 */
6439 		error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6440 		    fp->fp_glob->fg_cred);
6441 	} else {
6442 		error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6443 		    isstat64, needsrealdev, ctx);
6444 	}
6445 
6446 #if NAMEDRSRCFORK
6447 	if (is_namedstream) {
6448 		vnode_rele(nd.ni_vp);
6449 	}
6450 #endif
6451 	vnode_put(nd.ni_vp);
6452 	nameidone(&nd);
6453 	if (fp) {
6454 		file_drop(fd);
6455 		fp = NULL;
6456 	}
6457 
6458 	if (error) {
6459 		return error;
6460 	}
6461 	/* Zap spare fields */
6462 	if (isstat64 != 0) {
6463 		source.sb64.st_lspare = 0;
6464 		source.sb64.st_qspare[0] = 0LL;
6465 		source.sb64.st_qspare[1] = 0LL;
6466 		if (vfs_context_is64bit(ctx)) {
6467 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6468 			my_size = sizeof(dest.user64_sb64);
6469 			sbp = (caddr_t)&dest.user64_sb64;
6470 		} else {
6471 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6472 			my_size = sizeof(dest.user32_sb64);
6473 			sbp = (caddr_t)&dest.user32_sb64;
6474 		}
6475 		/*
6476 		 * Check if we raced (post lookup) against the last unlink of a file.
6477 		 */
6478 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6479 			source.sb64.st_nlink = 1;
6480 		}
6481 	} else {
6482 		source.sb.st_lspare = 0;
6483 		source.sb.st_qspare[0] = 0LL;
6484 		source.sb.st_qspare[1] = 0LL;
6485 		if (vfs_context_is64bit(ctx)) {
6486 			munge_user64_stat(&source.sb, &dest.user64_sb);
6487 			my_size = sizeof(dest.user64_sb);
6488 			sbp = (caddr_t)&dest.user64_sb;
6489 		} else {
6490 			munge_user32_stat(&source.sb, &dest.user32_sb);
6491 			my_size = sizeof(dest.user32_sb);
6492 			sbp = (caddr_t)&dest.user32_sb;
6493 		}
6494 
6495 		/*
6496 		 * Check if we raced (post lookup) against the last unlink of a file.
6497 		 */
6498 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6499 			source.sb.st_nlink = 1;
6500 		}
6501 	}
6502 	if ((error = copyout(sbp, ub, my_size)) != 0) {
6503 		goto out;
6504 	}
6505 
6506 	/* caller wants extended security information? */
6507 	if (xsecurity != USER_ADDR_NULL) {
6508 		/* did we get any? */
6509 		if (fsec == KAUTH_FILESEC_NONE) {
6510 			if (susize(xsecurity_size, 0) != 0) {
6511 				error = EFAULT;
6512 				goto out;
6513 			}
6514 		} else {
6515 			/* find the user buffer size */
6516 			xsecurity_bufsize = fusize(xsecurity_size);
6517 
6518 			/* copy out the actual data size */
6519 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6520 				error = EFAULT;
6521 				goto out;
6522 			}
6523 
6524 			/* if the caller supplied enough room, copy out to it */
6525 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6526 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6527 			}
6528 		}
6529 	}
6530 out:
6531 	if (fsec != KAUTH_FILESEC_NONE) {
6532 		kauth_filesec_free(fsec);
6533 	}
6534 	return error;
6535 }
6536 
6537 /*
6538  * stat_extended: Get file status; with extended security (ACL).
6539  *
6540  * Parameters:    p                       (ignored)
6541  *                uap                     User argument descriptor (see below)
6542  *                retval                  (ignored)
6543  *
6544  * Indirect:      uap->path               Path of file to get status from
6545  *                uap->ub                 User buffer (holds file status info)
6546  *                uap->xsecurity          ACL to get (extended security)
6547  *                uap->xsecurity_size     Size of ACL
6548  *
6549  * Returns:        0                      Success
6550  *                !0                      errno value
6551  *
6552  */
6553 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)6554 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6555     __unused int32_t *retval)
6556 {
6557 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6558 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6559 	           0);
6560 }
6561 
6562 /*
6563  * Returns:	0			Success
6564  *	fstatat_internal:???		[see fstatat_internal() in this file]
6565  */
6566 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)6567 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6568 {
6569 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6570 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6571 }
6572 
6573 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)6574 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6575 {
6576 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6577 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6578 }
6579 
6580 /*
6581  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6582  *
6583  * Parameters:    p                       (ignored)
6584  *                uap                     User argument descriptor (see below)
6585  *                retval                  (ignored)
6586  *
6587  * Indirect:      uap->path               Path of file to get status from
6588  *                uap->ub                 User buffer (holds file status info)
6589  *                uap->xsecurity          ACL to get (extended security)
6590  *                uap->xsecurity_size     Size of ACL
6591  *
6592  * Returns:        0                      Success
6593  *                !0                      errno value
6594  *
6595  */
6596 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)6597 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6598 {
6599 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6600 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6601 	           0);
6602 }
6603 
6604 /*
6605  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6606  *
6607  * Parameters:    p                       (ignored)
6608  *                uap                     User argument descriptor (see below)
6609  *                retval                  (ignored)
6610  *
6611  * Indirect:      uap->path               Path of file to get status from
6612  *                uap->ub                 User buffer (holds file status info)
6613  *                uap->xsecurity          ACL to get (extended security)
6614  *                uap->xsecurity_size     Size of ACL
6615  *
6616  * Returns:        0                      Success
6617  *                !0                      errno value
6618  *
6619  */
6620 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)6621 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6622 {
6623 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6624 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6625 	           AT_SYMLINK_NOFOLLOW);
6626 }
6627 
6628 /*
6629  * Get file status; this version does not follow links.
6630  */
6631 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)6632 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6633 {
6634 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6635 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6636 }
6637 
6638 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)6639 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6640 {
6641 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6642 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6643 }
6644 
6645 /*
6646  * lstat64_extended: Get file status; can handle large inode numbers; does not
6647  * follow links; with extended security (ACL).
6648  *
6649  * Parameters:    p                       (ignored)
6650  *                uap                     User argument descriptor (see below)
6651  *                retval                  (ignored)
6652  *
6653  * Indirect:      uap->path               Path of file to get status from
6654  *                uap->ub                 User buffer (holds file status info)
6655  *                uap->xsecurity          ACL to get (extended security)
6656  *                uap->xsecurity_size     Size of ACL
6657  *
6658  * Returns:        0                      Success
6659  *                !0                      errno value
6660  *
6661  */
6662 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)6663 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6664 {
6665 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6666 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6667 	           AT_SYMLINK_NOFOLLOW);
6668 }
6669 
6670 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)6671 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6672 {
6673 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
6674 		return EINVAL;
6675 	}
6676 
6677 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6678 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6679 }
6680 
6681 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)6682 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6683     __unused int32_t *retval)
6684 {
6685 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
6686 		return EINVAL;
6687 	}
6688 
6689 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6690 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6691 }
6692 
6693 /*
6694  * Get configurable pathname variables.
6695  *
6696  * Returns:	0			Success
6697  *	namei:???
6698  *	vn_pathconf:???
6699  *
6700  * Notes:	Global implementation  constants are intended to be
6701  *		implemented in this function directly; all other constants
6702  *		are per-FS implementation, and therefore must be handled in
6703  *		each respective FS, instead.
6704  *
6705  * XXX We implement some things globally right now that should actually be
6706  * XXX per-FS; we will need to deal with this at some point.
6707  */
6708 /* ARGSUSED */
6709 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)6710 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6711 {
6712 	int error;
6713 	struct nameidata nd;
6714 	vfs_context_t ctx = vfs_context_current();
6715 
6716 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6717 	    UIO_USERSPACE, uap->path, ctx);
6718 	error = namei(&nd);
6719 	if (error) {
6720 		return error;
6721 	}
6722 
6723 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6724 
6725 	vnode_put(nd.ni_vp);
6726 	nameidone(&nd);
6727 	return error;
6728 }
6729 
6730 /*
6731  * Return target name of a symbolic link.
6732  */
6733 /* ARGSUSED */
6734 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)6735 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
6736     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6737     int *retval)
6738 {
6739 	vnode_t vp;
6740 	uio_t auio;
6741 	int error;
6742 	struct nameidata nd;
6743 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
6744 	bool put_vnode;
6745 
6746 	if (bufsize > INT32_MAX) {
6747 		return EINVAL;
6748 	}
6749 
6750 	if (lnk_vp) {
6751 		vp = lnk_vp;
6752 		put_vnode = false;
6753 	} else {
6754 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6755 		    seg, path, ctx);
6756 
6757 		error = nameiat(&nd, fd);
6758 		if (error) {
6759 			return error;
6760 		}
6761 		vp = nd.ni_vp;
6762 		put_vnode = true;
6763 		nameidone(&nd);
6764 	}
6765 
6766 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6767 	    &uio_buf[0], sizeof(uio_buf));
6768 	uio_addiov(auio, buf, bufsize);
6769 	if (vp->v_type != VLNK) {
6770 		error = EINVAL;
6771 	} else {
6772 #if CONFIG_MACF
6773 		error = mac_vnode_check_readlink(ctx, vp);
6774 #endif
6775 		if (error == 0) {
6776 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6777 			    ctx);
6778 		}
6779 		if (error == 0) {
6780 			error = VNOP_READLINK(vp, auio, ctx);
6781 		}
6782 	}
6783 
6784 	if (put_vnode) {
6785 		vnode_put(vp);
6786 	}
6787 
6788 	*retval = (int)(bufsize - uio_resid(auio));
6789 	return error;
6790 }
6791 
6792 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)6793 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
6794 {
6795 	enum uio_seg procseg;
6796 	vnode_t vp;
6797 	int error;
6798 
6799 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6800 
6801 	AUDIT_ARG(fd, uap->fd);
6802 
6803 	if ((error = file_vnode(uap->fd, &vp))) {
6804 		return error;
6805 	}
6806 	if ((error = vnode_getwithref(vp))) {
6807 		file_drop(uap->fd);
6808 		return error;
6809 	}
6810 
6811 	error = readlinkat_internal(vfs_context_current(), -1,
6812 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
6813 	    uap->bufsize, procseg, retval);
6814 
6815 	vnode_put(vp);
6816 	file_drop(uap->fd);
6817 	return error;
6818 }
6819 
6820 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)6821 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6822 {
6823 	enum uio_seg procseg;
6824 
6825 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6826 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
6827 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6828 	           uap->count, procseg, retval);
6829 }
6830 
6831 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)6832 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6833 {
6834 	enum uio_seg procseg;
6835 
6836 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6837 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
6838 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
6839 	           retval);
6840 }
6841 
6842 /*
6843  * Change file flags, the deep inner layer.
6844  */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC policy gets first refusal on the new flag set */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* apply the change via the caller-supplied setter (e.g. vnode_setattr) */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* notify MAC policies only after the flags actually changed */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
6883 
6884 /*
6885  * Change file flags.
6886  *
6887  * NOTE: this will vnode_put() `vp'
6888  */
6889 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)6890 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6891 {
6892 	struct vnode_attr va;
6893 	int error;
6894 
6895 	VATTR_INIT(&va);
6896 	VATTR_SET(&va, va_flags, flags);
6897 
6898 	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6899 	vnode_put(vp);
6900 
6901 	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6902 		error = ENOTSUP;
6903 	}
6904 
6905 	return error;
6906 }
6907 
6908 /*
6909  * Change flags of a file given a path name.
6910  */
6911 /* ARGSUSED */
6912 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)6913 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6914 {
6915 	vnode_t vp;
6916 	vfs_context_t ctx = vfs_context_current();
6917 	int error;
6918 	struct nameidata nd;
6919 
6920 	AUDIT_ARG(fflags, uap->flags);
6921 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6922 	    UIO_USERSPACE, uap->path, ctx);
6923 	error = namei(&nd);
6924 	if (error) {
6925 		return error;
6926 	}
6927 	vp = nd.ni_vp;
6928 	nameidone(&nd);
6929 
6930 	/* we don't vnode_put() here because chflags1 does internally */
6931 	error = chflags1(vp, uap->flags, ctx);
6932 
6933 	return error;
6934 }
6935 
6936 /*
6937  * Change flags of a file given a file descriptor.
6938  */
6939 /* ARGSUSED */
6940 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)6941 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6942 {
6943 	vnode_t vp;
6944 	int error;
6945 
6946 	AUDIT_ARG(fd, uap->fd);
6947 	AUDIT_ARG(fflags, uap->flags);
6948 	if ((error = file_vnode(uap->fd, &vp))) {
6949 		return error;
6950 	}
6951 
6952 	if ((error = vnode_getwithref(vp))) {
6953 		file_drop(uap->fd);
6954 		return error;
6955 	}
6956 
6957 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6958 
6959 	/* we don't vnode_put() here because chflags1 does internally */
6960 	error = chflags1(vp, uap->flags, vfs_context_current());
6961 
6962 	file_drop(uap->fd);
6963 	return error;
6964 }
6965 
6966 /*
6967  * Change security information on a filesystem object.
6968  *
6969  * Returns:	0			Success
6970  *		EPERM			Operation not permitted
6971  *		vnode_authattr:???	[anything vnode_authattr can return]
6972  *		vnode_authorize:???	[anything vnode_authorize can return]
6973  *		vnode_setattr:???	[anything vnode_setattr can return]
6974  *
6975  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
6976  *		translated to EPERM before being returned.
6977  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC policies check each attribute class that is being changed */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* EACCES from the auth layer is reported as EPERM, per POSIX */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* change succeeded: notify MAC policies of the new attributes */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7045 
7046 
7047 /*
7048  * Change mode of a file given a path name.
7049  *
7050  * Returns:	0			Success
7051  *		namei:???		[anything namei can return]
7052  *		chmod_vnode:???		[anything chmod_vnode can return]
7053  */
7054 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7055 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7056     int fd, int flag, enum uio_seg segflg)
7057 {
7058 	struct nameidata nd;
7059 	int follow, error;
7060 
7061 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7062 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
7063 	    segflg, path, ctx);
7064 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7065 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7066 	}
7067 	if ((error = nameiat(&nd, fd))) {
7068 		return error;
7069 	}
7070 	error = chmod_vnode(ctx, nd.ni_vp, vap);
7071 	vnode_put(nd.ni_vp);
7072 	nameidone(&nd);
7073 	return error;
7074 }
7075 
7076 /*
7077  * chmod_extended: Change the mode of a file given a path name; with extended
7078  * argument list (including extended security (ACL)).
7079  *
7080  * Parameters:	p			Process requesting the open
7081  *		uap			User argument descriptor (see below)
7082  *		retval			(ignored)
7083  *
7084  * Indirect:	uap->path		Path to object (same as 'chmod')
7085  *		uap->uid		UID to set
7086  *		uap->gid		GID to set
7087  *		uap->mode		File mode to set (same as 'chmod')
7088  *		uap->xsecurity		ACL to set (or delete)
7089  *
7090  * Returns:	0			Success
7091  *		!0			errno value
7092  *
7093  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7094  *
7095  * XXX:		We should enummerate the possible errno values here, and where
7096  *		in the code they originated.
7097  */
7098 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7099 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7100 {
7101 	int error;
7102 	struct vnode_attr va;
7103 	kauth_filesec_t xsecdst;
7104 
7105 	AUDIT_ARG(owner, uap->uid, uap->gid);
7106 
7107 	VATTR_INIT(&va);
7108 	if (uap->mode != -1) {
7109 		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7110 	}
7111 	if (uap->uid != KAUTH_UID_NONE) {
7112 		VATTR_SET(&va, va_uid, uap->uid);
7113 	}
7114 	if (uap->gid != KAUTH_GID_NONE) {
7115 		VATTR_SET(&va, va_gid, uap->gid);
7116 	}
7117 
7118 	xsecdst = NULL;
7119 	switch (uap->xsecurity) {
7120 	/* explicit remove request */
7121 	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
7122 		VATTR_SET(&va, va_acl, NULL);
7123 		break;
7124 	/* not being set */
7125 	case USER_ADDR_NULL:
7126 		break;
7127 	default:
7128 		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
7129 			return error;
7130 		}
7131 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7132 		va.va_vaflags |= VA_FILESEC_ACL;
7133 		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
7134 	}
7135 
7136 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7137 	    UIO_USERSPACE);
7138 
7139 	if (xsecdst != NULL) {
7140 		kauth_filesec_free(xsecdst);
7141 	}
7142 	return error;
7143 }
7144 
7145 /*
7146  * Returns:	0			Success
7147  *		chmodat:???		[anything chmodat can return]
7148  */
7149 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7150 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7151     int flag, enum uio_seg segflg)
7152 {
7153 	struct vnode_attr va;
7154 
7155 	VATTR_INIT(&va);
7156 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7157 
7158 	return chmodat(ctx, path, &va, fd, flag, segflg);
7159 }
7160 
7161 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7162 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7163 {
7164 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7165 	           AT_FDCWD, 0, UIO_USERSPACE);
7166 }
7167 
7168 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7169 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7170 {
7171 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7172 		return EINVAL;
7173 	}
7174 
7175 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7176 	           uap->fd, uap->flag, UIO_USERSPACE);
7177 }
7178 
7179 /*
7180  * Change mode of a file given a file descriptor.
7181  */
7182 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7183 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7184 {
7185 	vnode_t vp;
7186 	int error;
7187 
7188 	AUDIT_ARG(fd, fd);
7189 
7190 	if ((error = file_vnode(fd, &vp)) != 0) {
7191 		return error;
7192 	}
7193 	if ((error = vnode_getwithref(vp)) != 0) {
7194 		file_drop(fd);
7195 		return error;
7196 	}
7197 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7198 
7199 	error = chmod_vnode(vfs_context_current(), vp, vap);
7200 	(void)vnode_put(vp);
7201 	file_drop(fd);
7202 
7203 	return error;
7204 }
7205 
7206 /*
7207  * fchmod_extended: Change mode of a file given a file descriptor; with
7208  * extended argument list (including extended security (ACL)).
7209  *
7210  * Parameters:    p                       Process requesting to change file mode
7211  *                uap                     User argument descriptor (see below)
7212  *                retval                  (ignored)
7213  *
7214  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7215  *                uap->uid                UID to set
7216  *                uap->gid                GID to set
7217  *                uap->xsecurity          ACL to set (or delete)
7218  *                uap->fd                 File descriptor of file to change mode
7219  *
7220  * Returns:        0                      Success
7221  *                !0                      errno value
7222  *
7223  */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	VATTR_INIT(&va);
	if (uap->mode != -1) {
		/* a mode of -1 means "do not change the mode" */
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	} else {
		va.va_mode = 0;
	}

	/* KAUTH_UID_NONE / KAUTH_GID_NONE mean the respective id is not being set */
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* both NULL and _FILESEC_REMOVE_ACL request deletion of any existing ACL */
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		/* copy the caller-supplied filesec in; freed after fchmod1() below */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	error = fchmod1(p, uap->fd, &va);


	/* release the copied-in filesec, if one was allocated above */
	switch (uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL) {
			kauth_filesec_free(xsecdst);
		}
	}
	return error;
}
7280 
7281 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7282 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7283 {
7284 	struct vnode_attr va;
7285 
7286 	VATTR_INIT(&va);
7287 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7288 
7289 	return fchmod1(p, uap->fd, &va);
7290 }
7291 
7292 
7293 /*
7294  * Set ownership given a path name.
7295  */
7296 /* ARGSUSED */
/*
 * Common code for chown(), lchown() and fchownat(): look up the path
 * (relative to fd unless fd is AT_FDCWD) and change its ownership.
 * A uid/gid of VNOVAL means "leave that id unchanged".
 */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	/* AT_SYMLINK_NOFOLLOW* means operate on the link itself */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* only request changes for the ids the caller actually supplied */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	vnode_put(vp);
	return error;
}
7366 
7367 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7368 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7369 {
7370 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7371 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7372 }
7373 
7374 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7375 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7376 {
7377 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7378 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7379 }
7380 
7381 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7382 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7383 {
7384 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7385 		return EINVAL;
7386 	}
7387 
7388 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7389 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7390 }
7391 
7392 /*
7393  * Set ownership given a file descriptor.
7394  */
7395 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* resolve the fd to its vnode, then take an iocount on it */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged" */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* an authorization failure on an fd is reported as EPERM */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7464 
7465 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)7466 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7467 {
7468 	int error;
7469 
7470 	if (usrtvp == USER_ADDR_NULL) {
7471 		struct timeval old_tv;
7472 		/* XXX Y2038 bug because of microtime argument */
7473 		microtime(&old_tv);
7474 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7475 		tsp[1] = tsp[0];
7476 	} else {
7477 		if (IS_64BIT_PROCESS(current_proc())) {
7478 			struct user64_timeval tv[2];
7479 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
7480 			if (error) {
7481 				return error;
7482 			}
7483 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
7484 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
7485 		} else {
7486 			struct user32_timeval tv[2];
7487 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
7488 			if (error) {
7489 				return error;
7490 			}
7491 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7492 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7493 		}
7494 	}
7495 	return 0;
7496 }
7497 
/*
 * setutimes: apply an (access, modify) timestamp pair to a vnode after
 * MAC and kauth authorization.  nullflag is non-zero when the caller
 * passed no explicit times ("set to now"); it tags the request with
 * VA_UTIMES_NULL and suppresses the EACCES->EPERM rewrite below.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* with explicit times, a permissions failure is EPERM */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7554 
7555 /*
7556  * Set the access and modification times of a file.
7557  */
7558 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

	/* the last argument tells setutimes whether "now" semantics apply */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* drop the iocount namei() took on the vnode */
	vnode_put(nd.ni_vp);
	return error;
}
7595 
7596 /*
7597  * Set the access and modification times of a file.
7598  */
7599 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* resolve the times first; a bad tptr fails before the fd lookup */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	error =  setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7626 
7627 /*
7628  * Truncate a file given its path name.
7629  */
7630 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;
	rlim_t fsize_limit;

	if (uap->length < 0) {
		return EINVAL;
	}

	/* enforce RLIMIT_FSIZE: deliver SIGXFSZ and fail when exceeded */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* truncation is expressed as a data-size attribute change */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize the size change */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7689 
7690 /*
7691  * Truncate a file given a file descriptor.
7692  */
7693 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	rlim_t fsize_limit;

	AUDIT_ARG(fd, uap->fd);
	if (uap->length < 0) {
		return EINVAL;
	}

	/* enforce RLIMIT_FSIZE: deliver SIGXFSZ and fail when exceeded */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared-memory objects have their own truncate path */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* the descriptor must be open for writing */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->fp_glob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	/* truncation is expressed as a data-size attribute change */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7768 
7769 
7770 /*
7771  * Sync an open file with synchronized I/O _file_ integrity completion
7772  */
7773 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync() is a pthread cancellation point */
	__pthread_testcancel(1);
	/* MNT_WAIT requests full file-integrity completion */
	return fsync_common(p, uap, MNT_WAIT);
}
7780 
7781 
7782 /*
7783  * Sync an open file with synchronized I/O _file_ integrity completion
7784  *
7785  * Notes:	This is a legacy support function that does not test for
7786  *		thread cancellation points.
7787  */
7788 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* identical argument layout makes the cast to fsync_args safe */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
7794 
7795 
7796 /*
7797  * Sync an open file with synchronized I/O _data_ integrity completion
7798  */
7799 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync() is a pthread cancellation point */
	__pthread_testcancel(1);
	/* MNT_DWAIT requests data-integrity-only completion */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
7806 
7807 
7808 /*
7809  * fsync_common
7810  *
7811  * Common fsync code to support both synchronized I/O file integrity completion
7812  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7813  *
7814  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7815  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
7817  * includes additional metadata unnecessary for retrieving the file data
7818  * contents, such as atime, mtime, ctime, etc., also be committed to stable
7819  * storage.
7820  *
7821  * Parameters:	p				The process
7822  *		uap->fd				The descriptor to synchronize
7823  *		flags				The data integrity flags
7824  *
7825  * Returns:	int				Success
7826  *	fp_getfvp:EBADF				Bad file descriptor
7827  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
7828  *	VNOP_FSYNC:???				unspecified
7829  *
7830  * Notes:	We use struct fsync_args because it is a short name, and all
7831  *		caller argument structures are otherwise identical.
7832  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* resolve the fd to its fileproc and vnode, then take an iocount */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity) */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7870 
7871 /*
7872  * Duplicate files.  Source must be a file, target must be a file or
7873  * must not exist.
7874  *
7875  * XXX Copyfile authorisation checking is woefully inadequate, and will not
7876  *     perform inheritance correctly.
7877  */
7878 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* SAVESTART keeps ni_startdir referenced; it is released at "out:" */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* an existing target is only allowed with CPF_OVERWRITE */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* sockets (other than fdesc nodes) cannot be copied */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* copying a file onto its own parent directory makes no sense */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is an internal sentinel, translated to success below */
		error = -1;
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* "source and target are the same vnode" is not an error */
	if (error == -1) {
		return 0;
	}
	return error;
}
7980 
7981 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7982 
7983 /*
7984  * Helper function for doing clones. The caller is expected to provide an
7985  * iocounted source vnode and release it.
7986  */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* only regular files, symlinks and (non-root) directories can be cloned */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* refuse to clone filesystem roots and mount points */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* the destination must not already exist */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* cloning is a single-filesystem operation */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* skip the data-read check when the fd-based caller already authorised it */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACL. The clone file
	 * will inherit the target directory's ACL.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/*
	 * NOTE(review): va_acl is never VATTR_WANTED above, so this branch
	 * only fires if the filesystem returned an ACL unrequested — confirm.
	 */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		/* from here on, vn_attribute_cleanup() must run at "out:" */
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8207 
8208 /*
8209  * clone files or directories, target must not exist.
8210  */
8211 /* ARGSUSED */
8212 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8213 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8214     __unused int32_t *retval)
8215 {
8216 	vnode_t fvp;
8217 	struct nameidata fromnd;
8218 	int follow;
8219 	int error;
8220 	vfs_context_t ctx = vfs_context_current();
8221 
8222 	/* Check that the flags are valid. */
8223 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8224 		return EINVAL;
8225 	}
8226 
8227 	AUDIT_ARG(fd, uap->src_dirfd);
8228 
8229 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8230 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8231 	    UIO_USERSPACE, uap->src, ctx);
8232 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8233 		return error;
8234 	}
8235 
8236 	fvp = fromnd.ni_vp;
8237 	nameidone(&fromnd);
8238 
8239 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8240 	    uap->flags, ctx);
8241 
8242 	vnode_put(fvp);
8243 	return error;
8244 }
8245 
8246 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8247 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8248     __unused int32_t *retval)
8249 {
8250 	vnode_t fvp;
8251 	struct fileproc *fp;
8252 	int error;
8253 	vfs_context_t ctx = vfs_context_current();
8254 
8255 	/* Check that the flags are valid. */
8256 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8257 		return EINVAL;
8258 	}
8259 
8260 	AUDIT_ARG(fd, uap->src_fd);
8261 	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8262 	if (error) {
8263 		return error;
8264 	}
8265 
8266 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8267 		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8268 		error = EBADF;
8269 		goto out;
8270 	}
8271 
8272 	if ((error = vnode_getwithref(fvp))) {
8273 		goto out;
8274 	}
8275 
8276 	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8277 
8278 	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8279 	    uap->flags, ctx);
8280 
8281 	vnode_put(fvp);
8282 out:
8283 	file_drop(uap->src_fd);
8284 	return error;
8285 }
8286 
/*
 * Mount-iteration callback: for every mount whose f_mntonname lies
 * strictly below the mount point passed in arg, refresh its recorded
 * f_mntonname from the covered vnode's current path.
 * NOTE(review): presumably invoked while renaming a directory that has
 * mounts beneath it — confirm against the caller.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);

	/* skip mounts that do not share the parent mount's path prefix */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* require a true path-component boundary right after the prefix */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* rewrite f_mntonname in place with the covered vnode's new path */
	int pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8316 
8317 /*
8318  * Rename files.  Source and destination must either both be directories,
8319  * or both not be directories.  If target is a directory, it must be empty.
8320  */
8321 /* ARGSUSED */
/*
 * renameat_internal: shared back end for rename(2), renameat(2) and
 * renameatx_np(2).
 *
 * Parameters:
 *	ctx	VFS context of the caller.
 *	fromfd	Directory fd the source path is resolved against (AT_FDCWD ok).
 *	from	Source pathname, interpreted per 'segflg'.
 *	tofd	Directory fd the destination path is resolved against.
 *	to	Destination pathname, interpreted per 'segflg'.
 *	segflg	UIO_USERSPACE / UIO_SYSSPACE: address space of the path args.
 *	uflags	RENAME_* flags from the caller (e.g. RENAME_NOFOLLOW_ANY;
 *		the VFS_RENAME_FLAGS_MASK subset is passed down to the FS).
 *
 * Returns: 0 on success, otherwise an errno value.
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	/*
	 * NOTE(review): 'continuing' is never set non-zero anywhere in this
	 * function, so both nameiat() calls below are (re)driven on every
	 * pass; continuation state is actually carried by the
	 * NAMEI_CONTLOOKUP flag on each nameidata.
	 */
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/*
	 * Re-entry point: taken after we dropped all iocounts to serialize a
	 * tree-reshaping rename under the mount rename lock, or when a race
	 * (ENOENT/ERECYCLE) forces us to redo the lookups from scratch.
	 */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

continue_lookup:
	/*
	 * Re-entry point for compound-VNOP continuation: when vn_rename()
	 * returns EKEEPLOOKING, the FS has set NAMEI_CONTLOOKUP on the
	 * nameidata(s) that need their lookup continued.
	 */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp  = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp  = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Only the kernel context may rename swap files. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires both endpoints to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			/*
			 * Compound case: no fvp yet; ask the FS to fill in
			 * the notification attributes during vn_rename().
			 */
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build full source path strings only if someone will consume them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
		 *      then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			/* Authorization is deferred to the out1 path. */
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	/* Non-compound case: authorize here; compound FSes authorize in-VNOP. */
	if (!batched) {
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname   = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock, perform the
			 * materialization, and start the whole thing over.
			 */
			error = vnode_materialize_dataless_file(fvp,
			    NAMESPACE_HANDLER_RENAME_OP);

			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		/* A swap is two renames: notify the reverse direction too. */
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9064 
9065 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9066 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9067 {
9068 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9069 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9070 }
9071 
9072 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9073 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9074 {
9075 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9076 		return EINVAL;
9077 	}
9078 
9079 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9080 		return EINVAL;
9081 	}
9082 
9083 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9084 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9085 }
9086 
9087 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9088 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9089 {
9090 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9091 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9092 }
9093 
9094 /*
9095  * Make a directory file.
9096  *
9097  * Returns:	0			Success
9098  *		EEXIST
9099  *	namei:???
9100  *	vnode_authorize:???
9101  *	vn_create:???
9102  */
9103 /* ARGSUSED */
/*
 * mkdir1at: create a directory at 'path', resolved relative to 'fd'
 * (AT_FDCWD allowed), with the attributes supplied in 'vap'.
 *
 * Supports compound-mkdir filesystems via NAMEI_COMPOUNDMKDIR: if
 * vn_create() returns EKEEPLOOKING, the lookup is continued and the
 * create retried.  On EACCES/EPERM from authorization, a plain lookup
 * is re-done so that an existing target reports EEXIST rather than
 * a permission error.
 *
 * Returns 0 on success, EEXIST if the target exists, or an error from
 * namei/authorization/vn_create.
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	/* Re-entered when vn_create() asks for a continued lookup. */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Release the first lookup before re-driving it. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs the lookup continued. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9215 
9216 /*
9217  * mkdir_extended: Create a directory; with extended security (ACL).
9218  *
9219  * Parameters:    p                       Process requesting to create the directory
9220  *                uap                     User argument descriptor (see below)
9221  *                retval                  (ignored)
9222  *
9223  * Indirect:      uap->path               Path of directory to create
9224  *                uap->mode               Access permissions to set
9225  *                uap->xsecurity          ACL to set
9226  *
9227  * Returns:        0                      Success
9228  *                !0                      Not success
9229  *
9230  */
9231 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9232 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9233 {
9234 	int ciferror;
9235 	kauth_filesec_t xsecdst;
9236 	struct vnode_attr va;
9237 
9238 	AUDIT_ARG(owner, uap->uid, uap->gid);
9239 
9240 	xsecdst = NULL;
9241 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9242 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9243 		return ciferror;
9244 	}
9245 
9246 	VATTR_INIT(&va);
9247 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9248 	if (xsecdst != NULL) {
9249 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9250 		va.va_vaflags |= VA_FILESEC_ACL;
9251 	}
9252 
9253 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9254 	    UIO_USERSPACE);
9255 	if (xsecdst != NULL) {
9256 		kauth_filesec_free(xsecdst);
9257 	}
9258 	return ciferror;
9259 }
9260 
9261 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9262 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9263 {
9264 	struct vnode_attr va;
9265 
9266 	VATTR_INIT(&va);
9267 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9268 
9269 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9270 	           UIO_USERSPACE);
9271 }
9272 
9273 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9274 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9275 {
9276 	struct vnode_attr va;
9277 
9278 	VATTR_INIT(&va);
9279 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9280 
9281 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9282 	           UIO_USERSPACE);
9283 }
9284 
9285 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)9286 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9287     enum uio_seg segflg, int unlink_flags)
9288 {
9289 	struct {
9290 		struct nameidata nd;
9291 #if CONFIG_FSE
9292 		struct vnode_attr va;
9293 #endif /* CONFIG_FSE */
9294 	} *__rmdir_data;
9295 	vnode_t vp, dvp;
9296 	int error;
9297 	struct nameidata *ndp;
9298 	char     *path = NULL;
9299 	char     *no_firmlink_path = NULL;
9300 	int       len_path = 0;
9301 	int       len_no_firmlink_path = 0;
9302 	int has_listeners = 0;
9303 	int need_event = 0;
9304 	int truncated_path = 0;
9305 	int truncated_no_firmlink_path = 0;
9306 	struct vnode_attr *vap = NULL;
9307 	int restart_count = 0;
9308 	int batched;
9309 
9310 	int restart_flag;
9311 
9312 	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
9313 	ndp = &__rmdir_data->nd;
9314 
9315 	/*
9316 	 * This loop exists to restart rmdir in the unlikely case that two
9317 	 * processes are simultaneously trying to remove the same directory
9318 	 * containing orphaned appleDouble files.
9319 	 */
9320 	do {
9321 		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9322 		    segflg, dirpath, ctx);
9323 		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
9324 continue_lookup:
9325 		restart_flag = 0;
9326 		vap = NULL;
9327 
9328 		error = nameiat(ndp, fd);
9329 		if (error) {
9330 			goto err_out;
9331 		}
9332 
9333 		dvp = ndp->ni_dvp;
9334 		vp = ndp->ni_vp;
9335 
9336 		if (vp) {
9337 			batched = vnode_compound_rmdir_available(vp);
9338 
9339 			if (vp->v_flag & VROOT) {
9340 				/*
9341 				 * The root of a mounted filesystem cannot be deleted.
9342 				 */
9343 				error = EBUSY;
9344 				goto out;
9345 			}
9346 
9347 #if DEVELOPMENT || DEBUG
9348 			/*
9349 			 * XXX VSWAP: Check for entitlements or special flag here
9350 			 * so we can restrict access appropriately.
9351 			 */
9352 #else /* DEVELOPMENT || DEBUG */
9353 
9354 			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9355 				error = EPERM;
9356 				goto out;
9357 			}
9358 #endif /* DEVELOPMENT || DEBUG */
9359 
9360 			/*
9361 			 * Removed a check here; we used to abort if vp's vid
9362 			 * was not the same as what we'd seen the last time around.
9363 			 * I do not think that check was valid, because if we retry
9364 			 * and all dirents are gone, the directory could legitimately
9365 			 * be recycled but still be present in a situation where we would
9366 			 * have had permission to delete.  Therefore, we won't make
9367 			 * an effort to preserve that check now that we may not have a
9368 			 * vp here.
9369 			 */
9370 
9371 			if (!batched) {
9372 				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
9373 				if (error) {
9374 					if (error == ENOENT) {
9375 						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9376 							restart_flag = 1;
9377 							restart_count += 1;
9378 						}
9379 					}
9380 					goto out;
9381 				}
9382 			}
9383 		} else {
9384 			batched = 1;
9385 
9386 			if (!vnode_compound_rmdir_available(dvp)) {
9387 				panic("No error, but no compound rmdir?");
9388 			}
9389 		}
9390 
9391 #if CONFIG_FSE
9392 		fse_info  finfo = {0};
9393 
9394 		need_event = need_fsevent(FSE_DELETE, dvp);
9395 		if (need_event) {
9396 			if (!batched) {
9397 				get_fse_info(vp, &finfo, ctx);
9398 			} else {
9399 				error = vfs_get_notify_attributes(&__rmdir_data->va);
9400 				if (error) {
9401 					goto out;
9402 				}
9403 
9404 				vap = &__rmdir_data->va;
9405 			}
9406 		}
9407 #endif
9408 		has_listeners = kauth_authorize_fileop_has_listeners();
9409 		if (need_event || has_listeners) {
9410 			if (path == NULL) {
9411 				GET_PATH(path);
9412 			}
9413 
9414 			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
9415 
9416 			if (no_firmlink_path == NULL) {
9417 				GET_PATH(no_firmlink_path);
9418 			}
9419 
9420 			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
9421 #if CONFIG_FSE
9422 			if (truncated_no_firmlink_path) {
9423 				finfo.mode |= FSE_TRUNCATED_PATH;
9424 			}
9425 #endif
9426 		}
9427 
9428 		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
9429 		ndp->ni_vp = vp;
9430 		if (vp == NULLVP) {
9431 			/* Couldn't find a vnode */
9432 			goto out;
9433 		}
9434 
9435 		if (error == EKEEPLOOKING) {
9436 			goto continue_lookup;
9437 		} else if (batched && error == ENOENT) {
9438 			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9439 				/*
9440 				 * For compound VNOPs, the authorization callback
9441 				 * may return ENOENT in case of racing hard link lookups
9442 				 * redrive the lookup.
9443 				 */
9444 				restart_flag = 1;
9445 				restart_count += 1;
9446 				goto out;
9447 			}
9448 		}
9449 
9450 		/*
9451 		 * XXX There's no provision for passing flags
9452 		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
9453 		 * because it's not empty, then we try again
9454 		 * with VNOP_REMOVE(), passing in a special
9455 		 * flag that clever file systems will know
9456 		 * how to handle.
9457 		 */
9458 		if (error == ENOTEMPTY &&
9459 		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
9460 			/*
9461 			 * If this fails, we want to keep the original
9462 			 * error.
9463 			 */
9464 			if (vn_remove(dvp, &vp, ndp,
9465 			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
9466 				error = 0;
9467 			}
9468 		}
9469 
9470 #if CONFIG_APPLEDOUBLE
9471 		/*
9472 		 * Special case to remove orphaned AppleDouble
9473 		 * files. I don't like putting this in the kernel,
9474 		 * but carbon does not like putting this in carbon either,
9475 		 * so here we are.
9476 		 */
9477 		if (error == ENOTEMPTY) {
9478 			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
9479 			if (ad_error == EBUSY) {
9480 				error = ad_error;
9481 				goto out;
9482 			}
9483 
9484 
9485 			/*
9486 			 * Assuming everything went well, we will try the RMDIR again
9487 			 */
9488 			if (!ad_error) {
9489 				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
9490 			}
9491 		}
9492 #endif /* CONFIG_APPLEDOUBLE */
9493 		/*
9494 		 * Call out to allow 3rd party notification of delete.
9495 		 * Ignore result of kauth_authorize_fileop call.
9496 		 */
9497 		if (!error) {
9498 			if (has_listeners) {
9499 				kauth_authorize_fileop(vfs_context_ucred(ctx),
9500 				    KAUTH_FILEOP_DELETE,
9501 				    (uintptr_t)vp,
9502 				    (uintptr_t)path);
9503 			}
9504 
9505 			if (vp->v_flag & VISHARDLINK) {
9506 				// see the comment in unlink1() about why we update
9507 				// the parent of a hard link when it is removed
9508 				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
9509 			}
9510 
9511 #if CONFIG_FSE
9512 			if (need_event) {
9513 				if (vap) {
9514 					vnode_get_fse_info_from_vap(vp, &finfo, vap);
9515 				}
9516 				add_fsevent(FSE_DELETE, ctx,
9517 				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
9518 				    FSE_ARG_FINFO, &finfo,
9519 				    FSE_ARG_DONE);
9520 			}
9521 #endif
9522 		}
9523 
9524 out:
9525 		if (path != NULL) {
9526 			RELEASE_PATH(path);
9527 			path = NULL;
9528 		}
9529 
9530 		if (no_firmlink_path != NULL) {
9531 			RELEASE_PATH(no_firmlink_path);
9532 			no_firmlink_path = NULL;
9533 		}
9534 
9535 		/*
9536 		 * nameidone has to happen before we vnode_put(dvp)
9537 		 * since it may need to release the fs_nodelock on the dvp
9538 		 */
9539 		nameidone(ndp);
9540 		vnode_put(dvp);
9541 
9542 		if (vp) {
9543 			vnode_put(vp);
9544 		}
9545 
9546 		if (restart_flag == 0) {
9547 			wakeup_one((caddr_t)vp);
9548 			goto err_out;
9549 		}
9550 		tsleep(vp, PVFS, "rm AD", 1);
9551 	} while (restart_flag != 0);
9552 
9553 err_out:
9554 	kfree_type(typeof(*__rmdir_data), __rmdir_data);
9555 
9556 	return error;
9557 }
9558 
9559 /*
9560  * Remove a directory file.
9561  */
9562 /* ARGSUSED */
9563 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)9564 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9565 {
9566 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9567 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9568 }
9569 
/*
 * Get direntry length padded to 8 byte alignment.
 * struct direntry reserves MAXPATHLEN bytes for d_name, so the unused
 * tail of that name buffer is subtracted back out before rounding up.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Same idea as above: struct dirent reserves __DARWIN_MAXNAMLEN + 1
 * bytes for d_name; only the actual name (plus NUL) is counted.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
9581 
9582 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)9583 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9584     int *numdirent, vfs_context_t ctxp)
9585 {
9586 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
9587 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9588 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9589 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9590 	} else {
9591 		size_t bufsize;
9592 		void * bufptr;
9593 		uio_t auio;
9594 		struct direntry *entry64;
9595 		struct dirent *dep;
9596 		size_t bytesread;
9597 		int error;
9598 
9599 		/*
9600 		 * We're here because the underlying file system does not
9601 		 * support direnties or we mounted denying support so we must
9602 		 * fall back to dirents and convert them to direntries.
9603 		 *
9604 		 * Our kernel buffer needs to be smaller since re-packing will
9605 		 * expand each dirent.  The worse case (when the name length
9606 		 * is 3 or less) corresponds to a struct direntry size of 32
9607 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9608 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
9609 		 * will prevent us from reading more than we can pack.
9610 		 *
9611 		 * Since this buffer is wired memory, we will limit the
9612 		 * buffer size to a maximum of 32K. We would really like to
9613 		 * use 32K in the MIN(), but we use magic number 87371 to
9614 		 * prevent uio_resid() * 3 / 8 from overflowing.
9615 		 */
9616 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9617 		bufptr = kalloc_data(bufsize, Z_WAITOK);
9618 		if (bufptr == NULL) {
9619 			return ENOMEM;
9620 		}
9621 
9622 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9623 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9624 		auio->uio_offset = uio->uio_offset;
9625 
9626 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9627 
9628 		dep = (struct dirent *)bufptr;
9629 		bytesread = bufsize - uio_resid(auio);
9630 
9631 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
9632 		/*
9633 		 * Convert all the entries and copy them out to user's buffer.
9634 		 */
9635 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9636 			/* First check that the dirent struct up to d_name is within the buffer */
9637 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
9638 			    /* Check that the length of the entire dirent is within the buffer */
9639 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9640 			    /* Check that the actual length including the name doesn't exceed d_reclen */
9641 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9642 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9643 				    vp->v_mount->mnt_vfsstat.f_mntonname,
9644 				    vp->v_name ? vp->v_name : "<unknown>");
9645 				error = EIO;
9646 				break;
9647 			}
9648 
9649 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
9650 
9651 			bzero(entry64, enbufsize);
9652 			/* Convert a dirent to a dirent64. */
9653 			entry64->d_ino = dep->d_ino;
9654 			entry64->d_seekoff = 0;
9655 			entry64->d_reclen = (uint16_t)enbufsize;
9656 			entry64->d_namlen = dep->d_namlen;
9657 			entry64->d_type = dep->d_type;
9658 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9659 
9660 			/* Move to next entry. */
9661 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
9662 
9663 			/* Copy entry64 to user's buffer. */
9664 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9665 		}
9666 
9667 		/* Update the real offset using the offset we got from VNOP_READDIR. */
9668 		if (error == 0) {
9669 			uio->uio_offset = auio->uio_offset;
9670 		}
9671 		uio_free(auio);
9672 		kfree_data(bufptr, bufsize);
9673 		kfree_type(struct direntry, entry64);
9674 		return error;
9675 	}
9676 }
9677 
9678 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
9679 
9680 /*
9681  * Read a block of directory entries in a file system independent format.
9682  */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock.  If the fd's backing data no longer
	 * matches the vnode we looked up (it can be swapped, e.g. by the
	 * union-mount traversal below), drop everything and retry the lookup.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the file's current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * An empty read from a union mount means this layer is exhausted;
	 * switch the fd over to the covered directory and read from there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	/* Report the offset the entries were read from (the starting offset). */
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
9796 
9797 
9798 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)9799 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9800 {
9801 	off_t offset;
9802 	ssize_t bytesread;
9803 	int error, eofflag;
9804 
9805 	AUDIT_ARG(fd, uap->fd);
9806 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
9807 	    &bytesread, &offset, &eofflag, 0);
9808 
9809 	if (error == 0) {
9810 		if (proc_is64bit(p)) {
9811 			user64_long_t base = (user64_long_t)offset;
9812 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9813 		} else {
9814 			user32_long_t base = (user32_long_t)offset;
9815 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9816 		}
9817 		*retval = (int)bytesread;
9818 	}
9819 	return error;
9820 }
9821 
9822 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)9823 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9824 {
9825 	off_t offset;
9826 	ssize_t bytesread;
9827 	int error, eofflag;
9828 	user_size_t bufsize;
9829 
9830 	AUDIT_ARG(fd, uap->fd);
9831 
9832 	/*
9833 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9834 	 * then the kernel carves out the last 4 bytes to return extended
9835 	 * information to userspace (namely whether we reached EOF with this call).
9836 	 */
9837 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9838 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9839 	} else {
9840 		bufsize = uap->bufsize;
9841 	}
9842 
9843 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
9844 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9845 
9846 	if (error == 0) {
9847 		*retval = bytesread;
9848 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9849 
9850 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9851 			getdirentries64_flags_t flags = 0;
9852 			if (eofflag) {
9853 				flags |= GETDIRENTRIES64_EOF;
9854 			}
9855 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9856 			    sizeof(flags));
9857 		}
9858 	}
9859 	return error;
9860 }
9861 
9862 
9863 /*
9864  * Set the mode mask for creation of filesystem nodes.
9865  * XXX implement xsecurity
9866  */
9867 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
9868 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)9869 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9870 {
9871 	AUDIT_ARG(mask, newmask);
9872 	proc_fdlock(p);
9873 	*retval = p->p_fd.fd_cmask;
9874 	p->p_fd.fd_cmask = newmask & ALLPERMS;
9875 	proc_fdunlock(p);
9876 	return 0;
9877 }
9878 
9879 /*
9880  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9881  *
9882  * Parameters:    p                       Process requesting to set the umask
9883  *                uap                     User argument descriptor (see below)
9884  *                retval                  umask of the process (parameter p)
9885  *
9886  * Indirect:      uap->newmask            umask to set
9887  *                uap->xsecurity          ACL to set
9888  *
9889  * Returns:        0                      Success
9890  *                !0                      Not success
9891  *
9892  */
9893 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)9894 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9895 {
9896 	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
9897 }
9898 
9899 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)9900 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9901 {
9902 	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9903 }
9904 
9905 /*
9906  * Void all references to file by ripping underlying filesystem
9907  * away from vnode.
9908  */
9909 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) to the target vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a file system mounted on it is busy. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* The caller must own the node or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only revoke if someone actually has the node open or aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
9962 
9963 
9964 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
9966  *  The following system calls are designed to support features
9967  *  which are specific to the HFS & HFS Plus volume formats
9968  */
9969 
9970 
9971 /*
9972  * Obtain attribute information on objects in a directory while enumerating
9973  * the directory.
9974  */
9975 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count for a possible union-layer restart. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-file offset lock.  If the fd's backing data no longer
	 * matches the vnode we looked up (e.g. the union-mount traversal below
	 * swapped it), drop everything and retry the lookup.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Return the updated count, the directory state, and the base offset. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10139 
10140 /*
10141  * Exchange data between two files
10142  */
10143 
10144 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first file. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second file. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs both read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Gather paths/finfo up front only if someone will consume them. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The underlying data was swapped, so swap the cached names
		 * (and, if needed, parents) to keep the name cache coherent.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10295 
10296 /*
10297  * Return (in MB) the amount of freespace on the given vnode's volume.
10298  */
10299 uint32_t freespace_mb(vnode_t vp);
10300 
10301 uint32_t
freespace_mb(vnode_t vp)10302 freespace_mb(vnode_t vp)
10303 {
10304 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10305 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10306 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10307 }
10308 
10309 #if CONFIG_SEARCHFS
10310 
10311 /* ARGSUSED */
10312 
10313 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10314 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10315 {
10316 	vnode_t vp, tvp;
10317 	int i, error = 0;
10318 	int fserror = 0;
10319 	struct nameidata nd;
10320 	struct user64_fssearchblock searchblock;
10321 	struct searchstate *state;
10322 	struct attrlist *returnattrs;
10323 	struct timeval timelimit;
10324 	void *searchparams1, *searchparams2;
10325 	uio_t auio = NULL;
10326 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10327 	uint32_t nummatches;
10328 	size_t mallocsize;
10329 	uint32_t nameiflags;
10330 	vfs_context_t ctx = vfs_context_current();
10331 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10332 
10333 	/* Start by copying in fsearchblock parameter list */
10334 	if (IS_64BIT_PROCESS(p)) {
10335 		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10336 		timelimit.tv_sec = searchblock.timelimit.tv_sec;
10337 		timelimit.tv_usec = searchblock.timelimit.tv_usec;
10338 	} else {
10339 		struct user32_fssearchblock tmp_searchblock;
10340 
10341 		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10342 		// munge into 64-bit version
10343 		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10344 		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10345 		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10346 		searchblock.maxmatches = tmp_searchblock.maxmatches;
10347 		/*
10348 		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10349 		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10350 		 */
10351 		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10352 		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10353 		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10354 		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10355 		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10356 		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10357 		searchblock.searchattrs = tmp_searchblock.searchattrs;
10358 	}
10359 	if (error) {
10360 		return error;
10361 	}
10362 
10363 	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10364 	 */
10365 	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10366 	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10367 		return EINVAL;
10368 	}
10369 
10370 	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10371 	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
10372 	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10373 	/* block.                                                                                             */
10374 	/*												      */
10375 	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
10376 	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
10377 	/*       assumes the size is still 556 bytes it will continue to work				      */
10378 
10379 	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10380 	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10381 
10382 	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10383 
10384 	/* Now set up the various pointers to the correct place in our newly allocated memory */
10385 
10386 	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10387 	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
10388 	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
10389 
10390 	/* Now copy in the stuff given our local variables. */
10391 
10392 	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
10393 		goto freeandexit;
10394 	}
10395 
10396 	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
10397 		goto freeandexit;
10398 	}
10399 
10400 	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
10401 		goto freeandexit;
10402 	}
10403 
10404 	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
10405 		goto freeandexit;
10406 	}
10407 
10408 	/*
10409 	 * When searching a union mount, need to set the
10410 	 * start flag at the first call on each layer to
10411 	 * reset state for the new volume.
10412 	 */
10413 	if (uap->options & SRCHFS_START) {
10414 		state->ss_union_layer = 0;
10415 	} else {
10416 		uap->options |= state->ss_union_flags;
10417 	}
10418 	state->ss_union_flags = 0;
10419 
10420 	/*
10421 	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10422 	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10423 	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10424 	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10425 	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10426 	 */
10427 
10428 	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10429 		attrreference_t* string_ref;
10430 		u_int32_t* start_length;
10431 		user64_size_t param_length;
10432 
10433 		/* validate searchparams1 */
10434 		param_length = searchblock.sizeofsearchparams1;
10435 		/* skip the word that specifies length of the buffer */
10436 		start_length = (u_int32_t*) searchparams1;
10437 		start_length = start_length + 1;
10438 		string_ref = (attrreference_t*) start_length;
10439 
10440 		/* ensure no negative offsets or too big offsets */
10441 		if (string_ref->attr_dataoffset < 0) {
10442 			error = EINVAL;
10443 			goto freeandexit;
10444 		}
10445 		if (string_ref->attr_length > MAXPATHLEN) {
10446 			error = EINVAL;
10447 			goto freeandexit;
10448 		}
10449 
10450 		/* Check for pointer overflow in the string ref */
10451 		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10452 			error = EINVAL;
10453 			goto freeandexit;
10454 		}
10455 
10456 		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10457 			error = EINVAL;
10458 			goto freeandexit;
10459 		}
10460 		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10461 			error = EINVAL;
10462 			goto freeandexit;
10463 		}
10464 	}
10465 
10466 	/* set up the uio structure which will contain the users return buffer */
10467 	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10468 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10469 
10470 	nameiflags = 0;
10471 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10472 		nameiflags |= FOLLOW;
10473 	}
10474 	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10475 	    UIO_USERSPACE, uap->path, ctx);
10476 
10477 	error = namei(&nd);
10478 	if (error) {
10479 		goto freeandexit;
10480 	}
10481 	vp = nd.ni_vp;
10482 	nameidone(&nd);
10483 
10484 	/*
10485 	 * Switch to the root vnode for the volume
10486 	 */
10487 	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10488 	vnode_put(vp);
10489 	if (error) {
10490 		goto freeandexit;
10491 	}
10492 	vp = tvp;
10493 
10494 #if CONFIG_UNION_MOUNTS
10495 	/*
10496 	 * If it's a union mount, the path lookup takes
10497 	 * us to the top layer. But we may need to descend
10498 	 * to a lower layer. For non-union mounts the layer
10499 	 * is always zero.
10500 	 */
10501 	for (i = 0; i < (int) state->ss_union_layer; i++) {
10502 		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10503 			break;
10504 		}
10505 		tvp = vp;
10506 		vp = vp->v_mount->mnt_vnodecovered;
10507 		if (vp == NULL) {
10508 			vnode_put(tvp);
10509 			error = ENOENT;
10510 			goto freeandexit;
10511 		}
10512 		error = vnode_getwithref(vp);
10513 		vnode_put(tvp);
10514 		if (error) {
10515 			goto freeandexit;
10516 		}
10517 	}
10518 #endif /* CONFIG_UNION_MOUNTS */
10519 
10520 #if CONFIG_MACF
10521 	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
10522 	if (error) {
10523 		vnode_put(vp);
10524 		goto freeandexit;
10525 	}
10526 #endif
10527 
10528 
10529 	/*
10530 	 * If searchblock.maxmatches == 0, then skip the search. This has happened
10531 	 * before and sometimes the underlying code doesnt deal with it well.
10532 	 */
10533 	if (searchblock.maxmatches == 0) {
10534 		nummatches = 0;
10535 		goto saveandexit;
10536 	}
10537 
10538 	/*
10539 	 * Allright, we have everything we need, so lets make that call.
10540 	 *
10541 	 * We keep special track of the return value from the file system:
10542 	 * EAGAIN is an acceptable error condition that shouldn't keep us
10543 	 * from copying out any results...
10544 	 */
10545 
10546 	fserror = VNOP_SEARCHFS(vp,
10547 	    searchparams1,
10548 	    searchparams2,
10549 	    &searchblock.searchattrs,
10550 	    (uint32_t)searchblock.maxmatches,
10551 	    &timelimit,
10552 	    returnattrs,
10553 	    &nummatches,
10554 	    (uint32_t)uap->scriptcode,
10555 	    (uint32_t)uap->options,
10556 	    auio,
10557 	    (struct searchstate *) &state->ss_fsstate,
10558 	    ctx);
10559 
10560 #if CONFIG_UNION_MOUNTS
10561 	/*
10562 	 * If it's a union mount we need to be called again
10563 	 * to search the mounted-on filesystem.
10564 	 */
10565 	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10566 		state->ss_union_flags = SRCHFS_START;
10567 		state->ss_union_layer++;        // search next layer down
10568 		fserror = EAGAIN;
10569 	}
10570 #endif /* CONFIG_UNION_MOUNTS */
10571 
10572 saveandexit:
10573 
10574 	vnode_put(vp);
10575 
10576 	/* Now copy out the stuff that needs copying out. That means the number of matches, the
10577 	 *  search state.  Everything was already put into he return buffer by the vop call. */
10578 
10579 	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10580 		goto freeandexit;
10581 	}
10582 
10583 	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10584 		goto freeandexit;
10585 	}
10586 
10587 	error = fserror;
10588 
10589 freeandexit:
10590 
10591 	kfree_data(searchparams1, mallocsize);
10592 
10593 	return error;
10594 } /* end of searchfs system call */
10595 
10596 #else /* CONFIG_SEARCHFS */
10597 
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs(2) is not supported in this configuration. */
	return ENOTSUP;
}
10603 
10604 #endif /* CONFIG_SEARCHFS */
10605 
10606 
10607 #if CONFIG_DATALESS_FILES
10608 
10609 /*
10610  * === Namespace Resolver Up-call Mechanism ===
10611  *
10612  * When I/O is performed to a dataless file or directory (read, write,
10613  * lookup-in, etc.), the file system performs an upcall to the namespace
10614  * resolver (filecoordinationd) to materialize the object.
10615  *
10616  * We need multiple up-calls to be in flight at once, and we need these
10617  * up-calls to be interruptible, thus the following implementation:
10618  *
10619  * => The nspace_resolver_request represents the in-kernel request state.
10620  *    It contains a request ID, storage space for the errno code returned
10621  *    by filecoordinationd, and flags.
10622  *
10623  * => The request ID is simply a global monotonically incrementing 32-bit
10624  *    number.  Outstanding requests are stored in a hash table, and the
10625  *    hash function is extremely simple.
10626  *
10627  * => When an upcall is to be made to filecoordinationd, a request structure
10628  *    is allocated on the stack (it is small, and needs to live only during
10629  *    the duration of the call to resolve_nspace_item_ext()).  It is
10630  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
10632  *    can be inserted into the table (and thus limiting the number of
10633  *    outstanding requests issued to filecoordinationd); waiting for an
10634  *    available slot is interruptible.
10635  *
10636  * => Once the request has been inserted into the table, the up-call is made
10637  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
10638  *    immediately and filecoordinationd processes the request asynchronously.
10639  *
 * => The caller now waits for the request to complete.  This is achieved by
10641  *    sleeping on the address of the request structure and waiting for
10642  *    filecoordinationd to mark the request structure as complete.  This
10643  *    is an interruptible sleep call; if interrupted, the request structure
10644  *    is removed from the table and EINTR is returned to the caller.  If
10645  *    this occurs, an advisory up-call is made to filecoordinationd with
10646  *    the request ID to indicate that the request can be aborted or
10647  *    de-prioritized at the discretion of filecoordinationd.
10648  *
10649  * => When filecoordinationd has completed the request, it signals completion
10650  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
10651  *    decorated as a namespace resolver can write to this sysctl node.  The
10652  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10653  *    The request ID is looked up in the table, and if the request is found,
10654  *    the error code is stored in the request structure and a wakeup()
10655  *    issued on the address of the request structure.  If the request is not
10656  *    found, we simply drop the completion notification, assuming that the
10657  *    caller was interrupted.
10658  *
10659  * => When the waiting thread wakes up, it extracts the error code from the
10660  *    request structure, removes the request from the table, and returns the
10661  *    error code to the calling function.  Fini!
10662  */
10663 
/*
 * In-kernel state for one outstanding up-call to the namespace resolver
 * (filecoordinationd).  Allocated on the requesting thread's stack and
 * linked into the request hash table while the request is outstanding
 * (see the design overview comment above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;	/* hash-bucket linkage */
	vnode_t         r_vp;			/* vnode being materialized */
	uint32_t        r_req_id;		/* ID used to match the completion */
	int             r_resolver_error;	/* errno reported by the resolver */
	int             r_flags;		/* RRF_* flags */
};
10671 
10672 #define RRF_COMPLETE    0x0001
10673 
/*
 * Return the next namespace-resolver request ID from a global
 * monotonically incrementing 32-bit counter.  Wrap-around is harmless:
 * outstanding requests are bounded (NSPACE_RESOLVER_MAX_OUTSTANDING)
 * and short-lived, so IDs cannot collide in practice.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
10681 
10682 #define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
10683 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */
10684 
10685 static LIST_HEAD(nspace_resolver_requesthead,
10686     nspace_resolver_request) * nspace_resolver_request_hashtbl;
10687 static u_long nspace_resolver_request_hashmask;
10688 static u_int nspace_resolver_request_count;
10689 static bool nspace_resolver_request_wait_slot;
10690 static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
10691 static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
10692     &nspace_resolver_request_lck_grp);
10693 
10694 #define NSPACE_REQ_LOCK() \
10695 	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
10696 #define NSPACE_REQ_UNLOCK() \
10697 	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
10698 
10699 #define NSPACE_RESOLVER_HASH(req_id)    \
10700 	(&nspace_resolver_request_hashtbl[(req_id) & \
10701 	 nspace_resolver_request_hashmask])
10702 
10703 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)10704 nspace_resolver_req_lookup(uint32_t req_id)
10705 {
10706 	struct nspace_resolver_requesthead *bucket;
10707 	struct nspace_resolver_request *req;
10708 
10709 	bucket = NSPACE_RESOLVER_HASH(req_id);
10710 	LIST_FOREACH(req, bucket, r_hashlink) {
10711 		if (req->r_req_id == req_id) {
10712 			return req;
10713 		}
10714 	}
10715 
10716 	return NULL;
10717 }
10718 
10719 static int
nspace_resolver_req_add(struct nspace_resolver_request * req)10720 nspace_resolver_req_add(struct nspace_resolver_request *req)
10721 {
10722 	struct nspace_resolver_requesthead *bucket;
10723 	int error;
10724 
10725 	while (nspace_resolver_request_count >=
10726 	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
10727 		nspace_resolver_request_wait_slot = true;
10728 		error = msleep(&nspace_resolver_request_count,
10729 		    &nspace_resolver_request_hash_mutex,
10730 		    PVFS | PCATCH, "nspacerq", NULL);
10731 		if (error) {
10732 			return error;
10733 		}
10734 	}
10735 
10736 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10737 #if DIAGNOSTIC
10738 	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
10739 #endif /* DIAGNOSTIC */
10740 	LIST_INSERT_HEAD(bucket, req, r_hashlink);
10741 	nspace_resolver_request_count++;
10742 
10743 	return 0;
10744 }
10745 
10746 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)10747 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10748 {
10749 	struct nspace_resolver_requesthead *bucket;
10750 
10751 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10752 #if DIAGNOSTIC
10753 	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10754 #endif /* DIAGNOSTIC */
10755 	LIST_REMOVE(req, r_hashlink);
10756 	nspace_resolver_request_count--;
10757 
10758 	if (nspace_resolver_request_wait_slot) {
10759 		nspace_resolver_request_wait_slot = false;
10760 		wakeup(&nspace_resolver_request_count);
10761 	}
10762 }
10763 
10764 static void
nspace_resolver_req_cancel(uint32_t req_id)10765 nspace_resolver_req_cancel(uint32_t req_id)
10766 {
10767 	kern_return_t kr;
10768 	mach_port_t mp;
10769 
10770 	// Failures here aren't fatal -- the cancellation message
10771 	// sent to the resolver is merely advisory.
10772 
10773 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10774 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10775 		return;
10776 	}
10777 
10778 	kr = send_nspace_resolve_cancel(mp, req_id);
10779 	if (kr != KERN_SUCCESS) {
10780 		os_log_error(OS_LOG_DEFAULT,
10781 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10782 	}
10783 
10784 	ipc_port_release_send(mp);
10785 }
10786 
/*
 * Wait (interruptibly) for the resolver to complete the given request.
 * On normal completion, returns the errno the resolver stored in the
 * request.  If the sleep is interrupted or times out, we store EINTR or
 * ETIMEDOUT ourselves, remove the request, and send an advisory cancel
 * to the resolver.  The request is always removed from the hash table
 * before returning.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	/* Sleep until the resolver marks the request RRF_COMPLETE. */
	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/*
			 * Interrupted: record the error on behalf of the
			 * resolver (it never got to) and remember to send
			 * the advisory cancellation after dropping the lock.
			 */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Advisory only; must be sent without holding NSPACE_REQ_LOCK. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10816 
/*
 * Record the resolver's result in the request, mark it complete, and
 * wake the thread sleeping in nspace_resolver_req_wait().
 * Caller holds NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10826 
/*
 * Handle a completion notification from the resolver for req_id,
 * carrying the resolver's errno and, optionally, the recursive gencount
 * the resolver observed before materializing (orig_gencount == 0 means
 * "not supplied").  If the vnode's current recursive gencount no longer
 * matches orig_gencount, the waiter gets EBUSY instead, since the
 * directory changed while being materialized.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/* Hold off renames on this mount while we compare gencounts. */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			/* If the getattr fails, cur_gencount == 0 skips the check below. */
			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
10894 
10895 static struct proc *nspace_resolver_proc;
10896 
10897 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)10898 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10899 {
10900 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10901 	    p == nspace_resolver_proc) ? 1 : 0;
10902 	return 0;
10903 }
10904 
10905 static int
nspace_resolver_set_proc_state(struct proc * p,int is_resolver)10906 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
10907 {
10908 	vfs_context_t ctx = vfs_context_current();
10909 	int error = 0;
10910 
10911 	//
10912 	// The system filecoordinationd runs as uid == 0.  This also
10913 	// has the nice side-effect of filtering out filecoordinationd
10914 	// running in the simulator.
10915 	//
10916 	if (!vfs_context_issuser(ctx)) {
10917 		return EPERM;
10918 	}
10919 
10920 	error = priv_check_cred(vfs_context_ucred(ctx),
10921 	    PRIV_VFS_DATALESS_RESOLVER, 0);
10922 	if (error) {
10923 		return error;
10924 	}
10925 
10926 	if (is_resolver) {
10927 		NSPACE_REQ_LOCK();
10928 
10929 		if (nspace_resolver_proc == NULL) {
10930 			proc_lock(p);
10931 			p->p_lflag |= P_LNSPACE_RESOLVER;
10932 			proc_unlock(p);
10933 			nspace_resolver_proc = p;
10934 		} else {
10935 			error = EBUSY;
10936 		}
10937 
10938 		NSPACE_REQ_UNLOCK();
10939 	} else {
10940 		// This is basically just like the exit case.
10941 		// nspace_resolver_exited() will verify that the
10942 		// process is the resolver, and will clear the
10943 		// global.
10944 		nspace_resolver_exited(p);
10945 	}
10946 
10947 	return error;
10948 }
10949 
10950 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)10951 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10952 {
10953 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10954 	    (p->p_vfs_iopolicy &
10955 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10956 		*is_prevented = 1;
10957 	} else {
10958 		*is_prevented = 0;
10959 	}
10960 	return 0;
10961 }
10962 
10963 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)10964 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10965 {
10966 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
10967 		return is_prevented ? 0 : EBUSY;
10968 	}
10969 
10970 	if (is_prevented) {
10971 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10972 	} else {
10973 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10974 	}
10975 	return 0;
10976 }
10977 
10978 static int
nspace_materialization_get_thread_state(int * is_prevented)10979 nspace_materialization_get_thread_state(int *is_prevented)
10980 {
10981 	uthread_t ut = current_uthread();
10982 
10983 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10984 	return 0;
10985 }
10986 
10987 static int
nspace_materialization_set_thread_state(int is_prevented)10988 nspace_materialization_set_thread_state(int is_prevented)
10989 {
10990 	uthread_t ut = current_uthread();
10991 
10992 	if (is_prevented) {
10993 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10994 	} else {
10995 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10996 	}
10997 	return 0;
10998 }
10999 
11000 /* the vfs.nspace branch */
11001 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11002 
11003 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11004 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11005     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11006 {
11007 	struct proc *p = req->p;
11008 	int new_value, old_value, changed = 0;
11009 	int error;
11010 
11011 	error = nspace_resolver_get_proc_state(p, &old_value);
11012 	if (error) {
11013 		return error;
11014 	}
11015 
11016 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11017 	    &changed);
11018 	if (error == 0 && changed) {
11019 		error = nspace_resolver_set_proc_state(p, new_value);
11020 	}
11021 	return error;
11022 }
11023 
11024 /* decorate this process as the dataless file resolver */
11025 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11026     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11027     0, 0, sysctl_nspace_resolver, "I", "");
11028 
11029 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11030 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11031     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11032 {
11033 	struct proc *p = req->p;
11034 	int new_value, old_value, changed = 0;
11035 	int error;
11036 
11037 	error = nspace_materialization_get_proc_state(p, &old_value);
11038 	if (error) {
11039 		return error;
11040 	}
11041 
11042 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11043 	    &changed);
11044 	if (error == 0 && changed) {
11045 		error = nspace_materialization_set_proc_state(p, new_value);
11046 	}
11047 	return error;
11048 }
11049 
11050 /* decorate this process as not wanting to materialize dataless files */
11051 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11052     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11053     0, 0, sysctl_nspace_prevent_materialization, "I", "");
11054 
11055 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11056 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11057     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11058 {
11059 	int new_value, old_value, changed = 0;
11060 	int error;
11061 
11062 	error = nspace_materialization_get_thread_state(&old_value);
11063 	if (error) {
11064 		return error;
11065 	}
11066 
11067 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11068 	    &changed);
11069 	if (error == 0 && changed) {
11070 		error = nspace_materialization_set_thread_state(new_value);
11071 	}
11072 	return error;
11073 }
11074 
11075 /* decorate this thread as not wanting to materialize dataless files */
11076 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11077     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11078     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11079 
/*
 * Handler for vfs.nspace.complete: the resolver reports a completed
 * request here by writing a { req_id, errno } pair of uint32_t's,
 * optionally followed by a uint64_t recursive gencount.  Only the
 * registered resolver process may write; everyone else gets EPERM.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* First opaque read consumes the mandatory { req_id, errno } pair. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11124 
11125 /* Resolver reports completed reqs here. */
11126 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11127     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11128     0, 0, sysctl_nspace_complete, "-", "");
11129 
11130 #endif /* CONFIG_DATALESS_FILES */
11131 
11132 #if CONFIG_DATALESS_FILES
11133 #define __no_dataless_unused    /* nothing */
11134 #else
11135 #define __no_dataless_unused    __unused
11136 #endif
11137 
/*
 * Decide whether dataless-file materialization is prevented for the
 * given vfs context.  Returns:
 *   0            materialization may proceed
 *   EDEADLK      materialization is prevented (this is the default)
 *   EJUSTRETURN  the caller is a dataless manipulator; proceed as if
 *                the object were not dataless
 * Per-thread decorations take precedence over the process-wide
 * iopolicy and even over the manipulation entitlement.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11194 
/*
 * Allocate the namespace-resolver request hash table.  Called once
 * during VFS startup; a no-op when CONFIG_DATALESS_FILES is disabled.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11204 
/*
 * Called when process p exits (or voluntarily resigns the resolver
 * role).  If p is the registered resolver, complete every outstanding
 * request with ETIMEDOUT so no waiter sleeps forever, then clear the
 * global resolver pointer.  Safe to call for non-resolver processes.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Sweep every hash bucket; nobody will answer these now. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11230 
/*
 * Materialize the dataless object vp for operation op.  Convenience
 * wrapper around resolve_nspace_item_ext() with no extra argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11236 
11237 #define DATALESS_RESOLVER_ENTITLEMENT     \
11238 	"com.apple.private.vfs.dataless-resolver"
11239 #define DATALESS_MANIPULATION_ENTITLEMENT \
11240 	"com.apple.private.vfs.dataless-manipulation"
11241 
11242 /*
11243  * Return TRUE if the vfs context is associated with a process entitled
11244  * for dataless manipulation.
11245  *
11246  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11247  * complication around CONFIG_DATALESS_FILES.
11248  */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
{
#if CONFIG_DATALESS_FILES
	/* Entitlements are checked against the current task, so the
	 * context must belong to the calling thread. */
	assert(ctx->vc_thread == current_thread());
	return IOCurrentTaskHasEntitlement( DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOCurrentTaskHasEntitlement(DATALESS_RESOLVER_ENTITLEMENT);
#else
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
11260 
11261 #if CONFIG_DATALESS_FILES
11262 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11263 log_materialization_prevented(vnode_t vp, uint64_t op)
11264 {
11265 	char p_name[MAXCOMLEN + 1];
11266 	char *vntype;
11267 	proc_selfname(&p_name[0], sizeof(p_name));
11268 
11269 	if (vp->v_type == VREG) {
11270 		vntype = "File";
11271 	} else if (vp->v_type == VDIR) {
11272 		vntype = "Dir";
11273 	} else if (vp->v_type == VLNK) {
11274 		vntype = "SymLink";
11275 	} else {
11276 		vntype = "Other";
11277 	}
11278 
11279 #if DEVELOPMENT
11280 	char *path = NULL;
11281 	int   len;
11282 
11283 	path = get_pathbuff();
11284 	len = MAXPATHLEN;
11285 	if (path) {
11286 		vn_getpath(vp, path, &len);
11287 	}
11288 
11289 	os_log_debug(OS_LOG_DEFAULT,
11290 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11291 	    p_name, proc_selfpid(),
11292 	    op, vntype, path ? path : "<unknown-path>");
11293 	if (path) {
11294 		release_pathbuff(path);
11295 	}
11296 #else
11297 	os_log_debug(OS_LOG_DEFAULT,
11298 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11299 	    p_name, proc_selfpid(),
11300 	    op, vntype);
11301 #endif
11302 }
11303 #endif /* CONFIG_DATALESS_FILES */
11304 
11305 
11306 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11307 vfs_materialize_item(
11308 	struct vnode *vp __no_dataless_unused,
11309 	uint64_t op __no_dataless_unused,
11310 	int64_t offset __no_dataless_unused,
11311 	int64_t size __no_dataless_unused,
11312 	char *lookup_name __no_dataless_unused,
11313 	size_t const namelen __no_dataless_unused)
11314 {
11315 #if CONFIG_DATALESS_FILES
11316 	struct nspace_resolver_request req;
11317 	kern_return_t kern_ret;
11318 	mach_port_t mach_port;
11319 	char *path = NULL;
11320 	vfs_context_t context;
11321 	int path_len;
11322 	int error;
11323 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11324 	audit_token_t atoken;
11325 #endif
11326 
11327 	/*
11328 	 * If this is a snapshot event and the vnode is on a disk image just
11329 	 * pretend nothing happened since any change to the disk image will
11330 	 * cause the disk image itself to get backed up and this avoids multi-
11331 	 * way deadlocks between the snapshot handler and the ever popular
11332 	 * diskimages-helper process. The variable nspace_allow_virtual_devs
11333 	 * allows this behavior to be overridden (for use by the Mobile
11334 	 * TimeMachine testing infrastructure which uses disk images).
11335 	 */
11336 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11337 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11338 		return ENOTSUP;
11339 	}
11340 
11341 	context = vfs_context_current();
11342 
11343 	error = vfs_context_dataless_materialization_is_prevented(context);
11344 	if (error) {
11345 		log_materialization_prevented(vp, op);
11346 		return error;
11347 	}
11348 
11349 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11350 	    &mach_port);
11351 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11352 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11353 		/*
11354 		 * Treat this like being unable to access the backing store
11355 		 * server.
11356 		 */
11357 		return ETIMEDOUT;
11358 	}
11359 
11360 	path = zalloc(ZV_NAMEI);
11361 	path_len = MAXPATHLEN;
11362 
11363 	error = vn_getpath(vp, path, &path_len);
11364 	if (error) {
11365 		goto out_release_port;
11366 	}
11367 
11368 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11369 	error = vfs_context_copy_audit_token(context, &atoken);
11370 	if (error) {
11371 		goto out_release_port;
11372 	}
11373 #endif
11374 
11375 	req.r_req_id = next_nspace_req_id();
11376 	req.r_resolver_error = 0;
11377 	req.r_flags = 0;
11378 	req.r_vp = vp;
11379 
11380 	NSPACE_REQ_LOCK();
11381 	error = nspace_resolver_req_add(&req);
11382 	NSPACE_REQ_UNLOCK();
11383 	if (error) {
11384 		goto out_release_port;
11385 	}
11386 
11387 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
11388 	if (vp->v_type == VDIR) {
11389 		char *tmpname = NULL;
11390 
11391 		/*
11392 		 * If the caller provided a lookup_name *and* a name length,
11393 		 * then we assume the lookup_name is not NUL-terminated.
11394 		 * Allocate a temporary buffer in this case to provide
11395 		 * a NUL-terminated path name to the IPC call.
11396 		 */
11397 		if (lookup_name != NULL && namelen != 0) {
11398 			if (namelen >= PATH_MAX) {
11399 				error = EINVAL;
11400 				goto out_release_port;
11401 			}
11402 			tmpname = zalloc(ZV_NAMEI);
11403 			strlcpy(tmpname, lookup_name, namelen + 1);
11404 			lookup_name = tmpname;
11405 		} else if (lookup_name != NULL) {
11406 			/*
11407 			 * If the caller provided a lookup_name with a
11408 			 * zero name length, then we assume it's NUL-
11409 			 * terminated.  Verify it has a valid length.
11410 			 */
11411 			if (strlen(lookup_name) >= PATH_MAX) {
11412 				error = EINVAL;
11413 				goto out_release_port;
11414 			}
11415 		}
11416 
11417 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11418 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
11419 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
11420 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
11421 #else
11422 		kern_ret = send_vfs_resolve_dir(mach_port, req.r_req_id,
11423 		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11424 		    lookup_name == NULL ? "" : lookup_name, path);
11425 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11426 
11427 		if (tmpname != NULL) {
11428 			zfree(ZV_NAMEI, tmpname);
11429 
11430 			/*
11431 			 * Poison lookup_name rather than reference
11432 			 * freed memory.
11433 			 */
11434 			lookup_name = NULL;
11435 		}
11436 	} else {
11437 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11438 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
11439 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
11440 		    offset, size, path, atoken);
11441 #else
11442 		kern_ret = send_vfs_resolve_file(mach_port, req.r_req_id,
11443 		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11444 		    offset, size, path);
11445 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11446 	}
11447 	if (kern_ret != KERN_SUCCESS) {
11448 		/*
11449 		 * Also treat this like being unable to access the backing
11450 		 * store server.
11451 		 */
11452 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
11453 		    kern_ret);
11454 		error = ETIMEDOUT;
11455 
11456 		NSPACE_REQ_LOCK();
11457 		nspace_resolver_req_remove(&req);
11458 		NSPACE_REQ_UNLOCK();
11459 		goto out_release_port;
11460 	}
11461 
11462 	/*
11463 	 * Give back the memory we allocated earlier while we wait; we
11464 	 * no longer need it.
11465 	 */
11466 	zfree(ZV_NAMEI, path);
11467 	path = NULL;
11468 
11469 	/*
11470 	 * Request has been submitted to the resolver. Now (interruptibly)
11471 	 * wait for completion. Upon requrn, the request will have been
11472 	 * removed from the lookup table.
11473 	 */
11474 	error = nspace_resolver_req_wait(&req);
11475 
11476 out_release_port:
11477 	if (path != NULL) {
11478 		zfree(ZV_NAMEI, path);
11479 	}
11480 	ipc_port_release_send(mach_port);
11481 
11482 	return error;
11483 #else
11484 	return ENOTSUP;
11485 #endif /* CONFIG_DATALESS_FILES */
11486 }
11487 
11488 /*
11489  * vfs_materialize_file: Materialize a regular file.
11490  *
11491  * Inputs:
11492  * vp		The dataless file to be materialized.
11493  *
11494  * op		What kind of operation is being performed:
11495  *		-> NAMESPACE_HANDLER_READ_OP
11496  *		-> NAMESPACE_HANDLER_WRITE_OP
11497  *		-> NAMESPACE_HANDLER_LINK_CREATE
11498  *		-> NAMESPACE_HANDLER_DELETE_OP
11499  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
11500  *		-> NAMESPACE_HANDLER_RENAME_OP
11501  *
11502  * offset	offset of I/O for READ or WRITE.  Ignored for
11503  *		other ops.
11504  *
11505  * size		size of I/O for READ or WRITE  Ignored for
11506  *		other ops.
11507  *
11508  * If offsize or size are -1 for a READ or WRITE, then the resolver should
11509  * consider the range to be unknown.
11510  *
11511  * Upon successful return, the caller may proceed with the operation.
11512  * N.B. the file may still be "dataless" in this case.
11513  */
11514 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)11515 vfs_materialize_file(
11516 	struct vnode *vp,
11517 	uint64_t op,
11518 	int64_t offset,
11519 	int64_t size)
11520 {
11521 	if (vp->v_type != VREG) {
11522 		return EFTYPE;
11523 	}
11524 	return vfs_materialize_item(vp, op, offset, size, NULL, 0);
11525 }
11526 
11527 /*
11528  * vfs_materialize_dir:
11529  *
11530  * Inputs:
11531  * vp		The dataless directory to be materialized.
11532  *
11533  * op		What kind of operation is being performed:
11534  *		-> NAMESPACE_HANDLER_READ_OP
11535  *		-> NAMESPACE_HANDLER_WRITE_OP
11536  *		-> NAMESPACE_HANDLER_DELETE_OP
11537  *		-> NAMESPACE_HANDLER_RENAME_OP
11538  *		-> NAMESPACE_HANDLER_LOOKUP_OP
11539  *
11540  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
11541  *		other ops.  May or may not be NUL-terminated; see below.
11542  *
11543  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
11544  *		terminated and namelen is the number of valid bytes in
11545  *		lookup_name. If zero, then lookup_name is assumed to be
11546  *		NUL-terminated.
11547  *
11548  * Upon successful return, the caller may proceed with the operation.
11549  * N.B. the directory may still be "dataless" in this case.
11550  */
11551 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)11552 vfs_materialize_dir(
11553 	struct vnode *vp,
11554 	uint64_t op,
11555 	char *lookup_name,
11556 	size_t namelen)
11557 {
11558 	if (vp->v_type != VDIR) {
11559 		return EFTYPE;
11560 	}
11561 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
11562 		return EINVAL;
11563 	}
11564 	return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
11565 }
11566 
/*
 * resolve_nspace_item_ext: legacy entry point for materializing a
 * dataless item (regular file, directory, or symlink) via the
 * send_nspace_resolve_path message to the file coordination service.
 * The extension argument is currently unused.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	// a no-materialization decoration on the current context denies the op
	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		// Register the request before sending; the resolver's reply
		// looks the request up by r_req_id.
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			// Un-register the request; it will never complete.
			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11684 
/*
 * nspace_snapshot_event: no-op stub.  Always succeeds; all arguments
 * are ignored.  Retained so callers of the snapshot-event hook keep
 * compiling and see a success result.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
11691 
11692 #if 0
/*
 * build_volfs_path: compiled out (enclosing "#if 0"); kept for reference.
 * Builds a /.vol/<fsid>/<fileid> style path for vp, or a placeholder
 * path if vnode_getattr() fails.  Returns 0 on success, -1 on failure;
 * *len is updated to the generated string length (including the NUL).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		/* placeholder keeps callers supplied with a valid string */
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
11713 #endif
11714 
11715 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)11716 fsctl_bogus_command_compat(unsigned long cmd)
11717 {
11718 	switch (cmd) {
11719 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
11720 		return FSIOC_SYNC_VOLUME;
11721 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
11722 		return FSIOC_ROUTEFS_SETROUTEID;
11723 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
11724 		return FSIOC_SET_PACKAGE_EXTS;
11725 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
11726 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
11727 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
11728 		return DISK_CONDITIONER_IOC_GET;
11729 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
11730 		return DISK_CONDITIONER_IOC_SET;
11731 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
11732 		return FSIOC_FIOSEEKHOLE;
11733 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
11734 		return FSIOC_FIOSEEKDATA;
11735 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
11736 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
11737 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11738 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
11739 	}
11740 
11741 	return cmd;
11742 }
11743 
/*
 * cas_bsdflags_setattr: setattr callback passed to chflags0() for the
 * FSIOC_CAS_BSDFLAGS fsctl; forwards the compare-and-swap of the BSD
 * flags to the filesystem via VNOP_IOCTL.  arg points at the caller's
 * struct fsioc_cas_bsdflags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11749 
/*
 * handle_sync_volume: implement FSIOC_SYNC_VOLUME.  Syncs the mount
 * containing vp.  Drops the caller's iocount on vp (and sets *arg_vp
 * to NULL so the caller knows); data points at the user's uint32_t
 * FSCTL_SYNC_* flag word.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_put(vp);

	/* translate the user's wait flag into the VFS_SYNC waitfor value */
	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (for e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* waitfor word built above
	 * against FSCTL_SYNC_FULLSYNC, not the user's flag word in *data.
	 * It only behaves as "full sync requested" if FSCTL_SYNC_FULLSYNC
	 * numerically aliases one of the MNT_* bits set above -- confirm
	 * whether *(uint32_t*)data was intended here.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
11812 
11813 #if ROUTEFS
11814 static int __attribute__((noinline))
handle_routes(user_addr_t udata)11815 handle_routes(user_addr_t udata)
11816 {
11817 	char routepath[MAXPATHLEN];
11818 	size_t len = 0;
11819 	int error;
11820 
11821 	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11822 		return error;
11823 	}
11824 	bzero(routepath, MAXPATHLEN);
11825 	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11826 	if (error) {
11827 		return error;
11828 	}
11829 	error = routefs_kernel_mount(routepath);
11830 	return error;
11831 }
11832 #endif
11833 
11834 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)11835 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
11836 {
11837 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11838 	struct vnode_attr va;
11839 	int error;
11840 
11841 	VATTR_INIT(&va);
11842 	VATTR_SET(&va, va_flags, cas->new_flags);
11843 
11844 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11845 	return error;
11846 }
11847 
11848 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)11849 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
11850 {
11851 	struct mount *mp = NULL;
11852 	errno_t rootauth = 0;
11853 
11854 	mp = vp->v_mount;
11855 
11856 	/*
11857 	 * query the underlying FS and see if it reports something
11858 	 * sane for this vnode. If volume is authenticated via
11859 	 * chunklist, leave that for the caller to determine.
11860 	 */
11861 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
11862 
11863 	return rootauth;
11864 }
11865 
11866 /*
11867  * Make a filesystem-specific control call:
11868  */
11869 /* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	/* small argument buffers live on the stack; larger ones are heap-allocated */
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* device nodes take ioctl(2), not fsctl(2) */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* map historical size-stripped command values to their full encodings */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* choose stack or heap storage for the marshalled argument */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/* marshal the user argument into `data` per the IOC_* direction bits */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* zero-length IOC_IN: the "argument" is the pointer value itself */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NB: handle_sync_volume() may drop the vnode and NULL *arg_vp */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		/* unpack the appropriately-sized user struct */
		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				/* non-empty name: require NUL within MFSTYPENAMELEN */
				int i;
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				/* read-only "mtmfs" overrides additionally tweak security flags */
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* empty name: clear any existing override */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* EBUSY unless the caller holds the only use reference on vp */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12107 
12108 /* ARGSUSED */
/*
 * fsctl: path-based fsctl(2) system call.  Looks up uap->path, runs
 * MACF checks, then dispatches to fsctl_internal().  FSOPT_NOFOLLOW
 * in uap->options suppresses symlink following.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may have dropped the iocount and NULLed vp */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
12160 /* ARGSUSED */
/*
 * ffsctl: fd-based fsctl(2) system call.  Resolves uap->fd to a
 * vnode, runs MACF checks, then dispatches to fsctl_internal().
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on:  */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	/* need an iocount before touching the vnode */
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
12202 /* end of fsctl system call */
12203 
12204 #define FILESEC_ACCESS_ENTITLEMENT              \
12205 	"com.apple.private.vfs.filesec-access"
12206 
12207 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12208 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12209 {
12210 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12211 		/*
12212 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12213 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12214 		 */
12215 		if ((!setting && vfs_context_issuser(ctx)) ||
12216 		    IOCurrentTaskHasEntitlement(FILESEC_ACCESS_ENTITLEMENT)) {
12217 			return 0;
12218 		}
12219 	}
12220 
12221 	return EPERM;
12222 }
12223 
12224 /*
12225  *  Retrieve the data of an extended attribute.
12226  */
12227 int
getxattr(proc_t p,struct getxattr_args * uap,user_ssize_t * retval)12228 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
12229 {
12230 	vnode_t vp;
12231 	struct nameidata nd;
12232 	char attrname[XATTR_MAXNAMELEN + 1];
12233 	vfs_context_t ctx = vfs_context_current();
12234 	uio_t auio = NULL;
12235 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12236 	size_t attrsize = 0;
12237 	size_t namelen;
12238 	u_int32_t nameiflags;
12239 	int error;
12240 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12241 
12242 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12243 		return EINVAL;
12244 	}
12245 
12246 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12247 	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
12248 	if ((error = namei(&nd))) {
12249 		return error;
12250 	}
12251 	vp = nd.ni_vp;
12252 	nameidone(&nd);
12253 
12254 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12255 	if (error != 0) {
12256 		goto out;
12257 	}
12258 	if (xattr_protected(attrname) &&
12259 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12260 		goto out;
12261 	}
12262 	/*
12263 	 * the specific check for 0xffffffff is a hack to preserve
12264 	 * binaray compatibilty in K64 with applications that discovered
12265 	 * that passing in a buf pointer and a size of -1 resulted in
12266 	 * just the size of the indicated extended attribute being returned.
12267 	 * this isn't part of the documented behavior, but because of the
12268 	 * original implemtation's check for "uap->size > 0", this behavior
12269 	 * was allowed. In K32 that check turned into a signed comparison
12270 	 * even though uap->size is unsigned...  in K64, we blow by that
12271 	 * check because uap->size is unsigned and doesn't get sign smeared
12272 	 * in the munger for a 32 bit user app.  we also need to add a
12273 	 * check to limit the maximum size of the buffer being passed in...
12274 	 * unfortunately, the underlying fileystems seem to just malloc
12275 	 * the requested size even if the actual extended attribute is tiny.
12276 	 * because that malloc is for kernel wired memory, we have to put a
12277 	 * sane limit on it.
12278 	 *
12279 	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
12280 	 * U64 running on K64 will yield -1 (64 bits wide)
12281 	 * U32/U64 running on K32 will yield -1 (32 bits wide)
12282 	 */
12283 	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
12284 		goto no_uio;
12285 	}
12286 
12287 	if (uap->value) {
12288 		if (uap->size > (size_t)XATTR_MAXSIZE) {
12289 			uap->size = XATTR_MAXSIZE;
12290 		}
12291 
12292 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12293 		    &uio_buf[0], sizeof(uio_buf));
12294 		uio_addiov(auio, uap->value, uap->size);
12295 	}
12296 no_uio:
12297 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
12298 out:
12299 	vnode_put(vp);
12300 
12301 	if (auio) {
12302 		*retval = uap->size - uio_resid(auio);
12303 	} else {
12304 		*retval = (user_ssize_t)attrsize;
12305 	}
12306 
12307 	return error;
12308 }
12309 
12310 /*
12311  * Retrieve the data of an extended attribute.
12312  */
12313 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12314 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12315 {
12316 	vnode_t vp;
12317 	char attrname[XATTR_MAXNAMELEN + 1];
12318 	vfs_context_t ctx = vfs_context_current();
12319 	uio_t auio = NULL;
12320 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12321 	size_t attrsize = 0;
12322 	size_t namelen;
12323 	int error;
12324 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12325 
12326 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12327 		return EINVAL;
12328 	}
12329 
12330 	if ((error = file_vnode(uap->fd, &vp))) {
12331 		return error;
12332 	}
12333 	if ((error = vnode_getwithref(vp))) {
12334 		file_drop(uap->fd);
12335 		return error;
12336 	}
12337 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12338 	if (error != 0) {
12339 		goto out;
12340 	}
12341 	if (xattr_protected(attrname) &&
12342 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12343 		goto out;
12344 	}
12345 	if (uap->value && uap->size > 0) {
12346 		if (uap->size > (size_t)XATTR_MAXSIZE) {
12347 			uap->size = XATTR_MAXSIZE;
12348 		}
12349 
12350 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12351 		    &uio_buf[0], sizeof(uio_buf));
12352 		uio_addiov(auio, uap->value, uap->size);
12353 	}
12354 
12355 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
12356 out:
12357 	(void)vnode_put(vp);
12358 	file_drop(uap->fd);
12359 
12360 	if (auio) {
12361 		*retval = uap->size - uio_resid(auio);
12362 	} else {
12363 		*retval = (user_ssize_t)attrsize;
12364 	}
12365 	return error;
12366 }
12367 
/*
 * Heap-allocated working state for setxattr(): the nameidata, attribute
 * name, and uio buffer together are too large to keep on the kernel stack.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
12374 
12375 /*
12376  * Set the data of an extended attribute.
12377  */
12378 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)12379 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
12380 {
12381 	vnode_t vp;
12382 	vfs_context_t ctx = vfs_context_current();
12383 	uio_t auio = NULL;
12384 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12385 	size_t namelen;
12386 	u_int32_t nameiflags;
12387 	int error;
12388 	struct setxattr_ctx *sactx;
12389 
12390 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12391 		return EINVAL;
12392 	}
12393 
12394 	sactx = (struct setxattr_ctx *)kalloc_data(sizeof(struct setxattr_ctx), Z_WAITOK);
12395 	if (sactx == NULL) {
12396 		return ENOMEM;
12397 	}
12398 
12399 	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
12400 	if (error != 0) {
12401 		if (error == EPERM) {
12402 			/* if the string won't fit in attrname, copyinstr emits EPERM */
12403 			error = ENAMETOOLONG;
12404 		}
12405 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12406 		goto out;
12407 	}
12408 	if (xattr_protected(sactx->attrname) &&
12409 	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
12410 		goto out;
12411 	}
12412 	if (uap->size != 0 && uap->value == 0) {
12413 		error = EINVAL;
12414 		goto out;
12415 	}
12416 	if (uap->size > INT_MAX) {
12417 		error = E2BIG;
12418 		goto out;
12419 	}
12420 
12421 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12422 	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
12423 	if ((error = namei(&sactx->nd))) {
12424 		goto out;
12425 	}
12426 	vp = sactx->nd.ni_vp;
12427 	nameidone(&sactx->nd);
12428 
12429 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12430 	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
12431 	uio_addiov(auio, uap->value, uap->size);
12432 
12433 	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
12434 #if CONFIG_FSE
12435 	if (error == 0) {
12436 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
12437 		    FSE_ARG_VNODE, vp,
12438 		    FSE_ARG_DONE);
12439 	}
12440 #endif
12441 	vnode_put(vp);
12442 out:
12443 	kfree_data(sactx, sizeof(struct setxattr_ctx));
12444 	*retval = 0;
12445 	return error;
12446 }
12447 
12448 /*
12449  * Set the data of an extended attribute.
12450  */
12451 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)12452 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
12453 {
12454 	vnode_t vp;
12455 	char attrname[XATTR_MAXNAMELEN + 1];
12456 	vfs_context_t ctx = vfs_context_current();
12457 	uio_t auio = NULL;
12458 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12459 	size_t namelen;
12460 	int error;
12461 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12462 
12463 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12464 		return EINVAL;
12465 	}
12466 
12467 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12468 	if (error != 0) {
12469 		if (error == EPERM) {
12470 			/* if the string won't fit in attrname, copyinstr emits EPERM */
12471 			return ENAMETOOLONG;
12472 		}
12473 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12474 		return error;
12475 	}
12476 	if (xattr_protected(attrname) &&
12477 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
12478 		return error;
12479 	}
12480 	if (uap->size != 0 && uap->value == 0) {
12481 		return EINVAL;
12482 	}
12483 	if (uap->size > INT_MAX) {
12484 		return E2BIG;
12485 	}
12486 	if ((error = file_vnode(uap->fd, &vp))) {
12487 		return error;
12488 	}
12489 	if ((error = vnode_getwithref(vp))) {
12490 		file_drop(uap->fd);
12491 		return error;
12492 	}
12493 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12494 	    &uio_buf[0], sizeof(uio_buf));
12495 	uio_addiov(auio, uap->value, uap->size);
12496 
12497 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
12498 #if CONFIG_FSE
12499 	if (error == 0) {
12500 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
12501 		    FSE_ARG_VNODE, vp,
12502 		    FSE_ARG_DONE);
12503 	}
12504 #endif
12505 	vnode_put(vp);
12506 	file_drop(uap->fd);
12507 	*retval = 0;
12508 	return error;
12509 }
12510 
12511 /*
12512  * Remove an extended attribute.
12513  * XXX Code duplication here.
12514  */
12515 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)12516 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
12517 {
12518 	vnode_t vp;
12519 	struct nameidata nd;
12520 	char attrname[XATTR_MAXNAMELEN + 1];
12521 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12522 	vfs_context_t ctx = vfs_context_current();
12523 	size_t namelen;
12524 	u_int32_t nameiflags;
12525 	int error;
12526 
12527 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12528 		return EINVAL;
12529 	}
12530 
12531 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12532 	if (error != 0) {
12533 		return error;
12534 	}
12535 	if (xattr_protected(attrname)) {
12536 		return EPERM;
12537 	}
12538 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12539 	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
12540 	if ((error = namei(&nd))) {
12541 		return error;
12542 	}
12543 	vp = nd.ni_vp;
12544 	nameidone(&nd);
12545 
12546 	error = vn_removexattr(vp, attrname, uap->options, ctx);
12547 #if CONFIG_FSE
12548 	if (error == 0) {
12549 		add_fsevent(FSE_XATTR_REMOVED, ctx,
12550 		    FSE_ARG_VNODE, vp,
12551 		    FSE_ARG_DONE);
12552 	}
12553 #endif
12554 	vnode_put(vp);
12555 	*retval = 0;
12556 	return error;
12557 }
12558 
12559 /*
12560  * Remove an extended attribute.
12561  * XXX Code duplication here.
12562  */
12563 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)12564 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
12565 {
12566 	vnode_t vp;
12567 	char attrname[XATTR_MAXNAMELEN + 1];
12568 	size_t namelen;
12569 	int error;
12570 #if CONFIG_FSE
12571 	vfs_context_t ctx = vfs_context_current();
12572 #endif
12573 
12574 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12575 		return EINVAL;
12576 	}
12577 
12578 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12579 	if (error != 0) {
12580 		return error;
12581 	}
12582 	if (xattr_protected(attrname)) {
12583 		return EPERM;
12584 	}
12585 	if ((error = file_vnode(uap->fd, &vp))) {
12586 		return error;
12587 	}
12588 	if ((error = vnode_getwithref(vp))) {
12589 		file_drop(uap->fd);
12590 		return error;
12591 	}
12592 
12593 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
12594 #if CONFIG_FSE
12595 	if (error == 0) {
12596 		add_fsevent(FSE_XATTR_REMOVED, ctx,
12597 		    FSE_ARG_VNODE, vp,
12598 		    FSE_ARG_DONE);
12599 	}
12600 #endif
12601 	vnode_put(vp);
12602 	file_drop(uap->fd);
12603 	*retval = 0;
12604 	return error;
12605 }
12606 
12607 /*
12608  * Retrieve the list of extended attribute names.
12609  * XXX Code duplication here.
12610  */
12611 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)12612 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
12613 {
12614 	vnode_t vp;
12615 	struct nameidata nd;
12616 	vfs_context_t ctx = vfs_context_current();
12617 	uio_t auio = NULL;
12618 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12619 	size_t attrsize = 0;
12620 	u_int32_t nameiflags;
12621 	int error;
12622 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12623 
12624 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12625 		return EINVAL;
12626 	}
12627 
12628 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12629 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
12630 	if ((error = namei(&nd))) {
12631 		return error;
12632 	}
12633 	vp = nd.ni_vp;
12634 	nameidone(&nd);
12635 	if (uap->namebuf != 0 && uap->bufsize > 0) {
12636 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
12637 		    &uio_buf[0], sizeof(uio_buf));
12638 		uio_addiov(auio, uap->namebuf, uap->bufsize);
12639 	}
12640 
12641 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
12642 
12643 	vnode_put(vp);
12644 	if (auio) {
12645 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12646 	} else {
12647 		*retval = (user_ssize_t)attrsize;
12648 	}
12649 	return error;
12650 }
12651 
12652 /*
12653  * Retrieve the list of extended attribute names.
12654  * XXX Code duplication here.
12655  */
12656 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)12657 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
12658 {
12659 	vnode_t vp;
12660 	uio_t auio = NULL;
12661 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12662 	size_t attrsize = 0;
12663 	int error;
12664 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12665 
12666 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12667 		return EINVAL;
12668 	}
12669 
12670 	if ((error = file_vnode(uap->fd, &vp))) {
12671 		return error;
12672 	}
12673 	if ((error = vnode_getwithref(vp))) {
12674 		file_drop(uap->fd);
12675 		return error;
12676 	}
12677 	if (uap->namebuf != 0 && uap->bufsize > 0) {
12678 		auio = uio_createwithbuffer(1, 0, spacetype,
12679 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
12680 		uio_addiov(auio, uap->namebuf, uap->bufsize);
12681 	}
12682 
12683 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
12684 
12685 	vnode_put(vp);
12686 	file_drop(uap->fd);
12687 	if (auio) {
12688 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12689 	} else {
12690 		*retval = (user_ssize_t)attrsize;
12691 	}
12692 	return error;
12693 }
12694 
12695 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)12696 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
12697     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
12698 {
12699 	int error;
12700 	struct mount *mp = NULL;
12701 	vnode_t vp;
12702 	int length;
12703 	int bpflags;
12704 	/* maximum number of times to retry build_path */
12705 	unsigned int retries = 0x10;
12706 
12707 	if (bufsize > PAGE_SIZE) {
12708 		return EINVAL;
12709 	}
12710 
12711 	if (buf == NULL) {
12712 		return ENOMEM;
12713 	}
12714 
12715 retry:
12716 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
12717 		error = ENOTSUP;  /* unexpected failure */
12718 		return ENOTSUP;
12719 	}
12720 
12721 #if CONFIG_UNION_MOUNTS
12722 unionget:
12723 #endif /* CONFIG_UNION_MOUNTS */
12724 	if (objid == 2) {
12725 		struct vfs_attr vfsattr;
12726 		int use_vfs_root = TRUE;
12727 
12728 		VFSATTR_INIT(&vfsattr);
12729 		VFSATTR_WANTED(&vfsattr, f_capabilities);
12730 		if (!(options & FSOPT_ISREALFSID) &&
12731 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
12732 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
12733 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
12734 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
12735 				use_vfs_root = FALSE;
12736 			}
12737 		}
12738 
12739 		if (use_vfs_root) {
12740 			error = VFS_ROOT(mp, &vp, ctx);
12741 		} else {
12742 			error = VFS_VGET(mp, objid, &vp, ctx);
12743 		}
12744 	} else {
12745 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
12746 	}
12747 
12748 #if CONFIG_UNION_MOUNTS
12749 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
12750 		/*
12751 		 * If the fileid isn't found and we're in a union
12752 		 * mount volume, then see if the fileid is in the
12753 		 * mounted-on volume.
12754 		 */
12755 		struct mount *tmp = mp;
12756 		mp = vnode_mount(tmp->mnt_vnodecovered);
12757 		vfs_unbusy(tmp);
12758 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
12759 			goto unionget;
12760 		}
12761 	} else {
12762 		vfs_unbusy(mp);
12763 	}
12764 #else
12765 	vfs_unbusy(mp);
12766 #endif /* CONFIG_UNION_MOUNTS */
12767 
12768 	if (error) {
12769 		return error;
12770 	}
12771 
12772 #if CONFIG_MACF
12773 	error = mac_vnode_check_fsgetpath(ctx, vp);
12774 	if (error) {
12775 		vnode_put(vp);
12776 		return error;
12777 	}
12778 #endif
12779 
12780 	/* Obtain the absolute path to this vnode. */
12781 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
12782 	if (options & FSOPT_NOFIRMLINKPATH) {
12783 		bpflags |= BUILDPATH_NO_FIRMLINK;
12784 	}
12785 	bpflags |= BUILDPATH_CHECK_MOVED;
12786 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
12787 	vnode_put(vp);
12788 
12789 	if (error) {
12790 		/* there was a race building the path, try a few more times */
12791 		if (error == EAGAIN) {
12792 			--retries;
12793 			if (retries > 0) {
12794 				goto retry;
12795 			}
12796 
12797 			error = ENOENT;
12798 		}
12799 		goto out;
12800 	}
12801 
12802 	AUDIT_ARG(text, buf);
12803 
12804 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
12805 		unsigned long path_words[NUMPARMS];
12806 		size_t path_len = sizeof(path_words);
12807 
12808 		if ((size_t)length < path_len) {
12809 			memcpy((char *)path_words, buf, length);
12810 			memset((char *)path_words + length, 0, path_len - length);
12811 
12812 			path_len = length;
12813 		} else {
12814 			memcpy((char *)path_words, buf + (length - path_len), path_len);
12815 		}
12816 
12817 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
12818 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
12819 	}
12820 
12821 	*pathlen = length; /* may be superseded by error */
12822 
12823 out:
12824 	return error;
12825 }
12826 
12827 /*
12828  * Obtain the full pathname of a file system object by id.
12829  */
12830 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)12831 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
12832     uint32_t options, user_ssize_t *retval)
12833 {
12834 	vfs_context_t ctx = vfs_context_current();
12835 	fsid_t fsid;
12836 	char *realpath;
12837 	int length;
12838 	int error;
12839 
12840 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
12841 		return EINVAL;
12842 	}
12843 
12844 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
12845 		return error;
12846 	}
12847 	AUDIT_ARG(value32, fsid.val[0]);
12848 	AUDIT_ARG(value64, objid);
12849 	/* Restrict output buffer size for now. */
12850 
12851 	if (bufsize > PAGE_SIZE || bufsize <= 0) {
12852 		return EINVAL;
12853 	}
12854 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
12855 	if (realpath == NULL) {
12856 		return ENOMEM;
12857 	}
12858 
12859 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
12860 	    options, &length);
12861 
12862 	if (error) {
12863 		goto out;
12864 	}
12865 
12866 	error = copyout((caddr_t)realpath, buf, length);
12867 
12868 	*retval = (user_ssize_t)length; /* may be superseded by error */
12869 out:
12870 	kfree_data(realpath, bufsize);
12871 	return error;
12872 }
12873 
/*
 * fsgetpath(2): legacy entry point -- identical to fsgetpath_ext(2)
 * with no options.
 */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
12880 
/*
 * fsgetpath_ext(2): like fsgetpath(2) but accepts user-supplied options
 * (FSOPT_NOFIRMLINKPATH, FSOPT_ISREALFSID).
 */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
12887 
12888 /*
12889  * Common routine to handle various flavors of statfs data heading out
12890  *	to user space.
12891  *
12892  * Returns:	0			Success
12893  *		EFAULT
12894  */
12895 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)12896 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
12897     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
12898     boolean_t partial_copy)
12899 {
12900 	int             error;
12901 	int             my_size, copy_size;
12902 
12903 	if (is_64_bit) {
12904 		struct user64_statfs sfs;
12905 		my_size = copy_size = sizeof(sfs);
12906 		bzero(&sfs, my_size);
12907 		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12908 		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
12909 		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12910 		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
12911 		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
12912 		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
12913 		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
12914 		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
12915 		sfs.f_files = (user64_long_t)sfsp->f_files;
12916 		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
12917 		sfs.f_fsid = sfsp->f_fsid;
12918 		sfs.f_owner = sfsp->f_owner;
12919 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12920 			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12921 		} else {
12922 			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12923 		}
12924 		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12925 		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12926 
12927 		if (partial_copy) {
12928 			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12929 		}
12930 		error = copyout((caddr_t)&sfs, bufp, copy_size);
12931 	} else {
12932 		struct user32_statfs sfs;
12933 
12934 		my_size = copy_size = sizeof(sfs);
12935 		bzero(&sfs, my_size);
12936 
12937 		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12938 		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
12939 		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12940 
12941 		/*
12942 		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
12943 		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
12944 		 * to reflect the filesystem size as best we can.
12945 		 */
12946 		if ((sfsp->f_blocks > INT_MAX)
12947 		    /* Hack for 4061702 . I think the real fix is for Carbon to
12948 		     * look for some volume capability and not depend on hidden
12949 		     * semantics agreed between a FS and carbon.
12950 		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
12951 		     * for Carbon to set bNoVolumeSizes volume attribute.
12952 		     * Without this the webdavfs files cannot be copied onto
12953 		     * disk as they look huge. This change should not affect
12954 		     * XSAN as they should not setting these to -1..
12955 		     */
12956 		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
12957 		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
12958 		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
12959 			int             shift;
12960 
12961 			/*
12962 			 * Work out how far we have to shift the block count down to make it fit.
12963 			 * Note that it's possible to have to shift so far that the resulting
12964 			 * blocksize would be unreportably large.  At that point, we will clip
12965 			 * any values that don't fit.
12966 			 *
12967 			 * For safety's sake, we also ensure that f_iosize is never reported as
12968 			 * being smaller than f_bsize.
12969 			 */
12970 			for (shift = 0; shift < 32; shift++) {
12971 				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
12972 					break;
12973 				}
12974 				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
12975 					break;
12976 				}
12977 			}
12978 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
12979 			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
12980 			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
12981 			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
12982 #undef __SHIFT_OR_CLIP
12983 			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
12984 			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
12985 		} else {
12986 			/* filesystem is small enough to be reported honestly */
12987 			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
12988 			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
12989 			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
12990 			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
12991 			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
12992 		}
12993 		sfs.f_files = (user32_long_t)sfsp->f_files;
12994 		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
12995 		sfs.f_fsid = sfsp->f_fsid;
12996 		sfs.f_owner = sfsp->f_owner;
12997 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12998 			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12999 		} else {
13000 			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
13001 		}
13002 		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13003 		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13004 
13005 		if (partial_copy) {
13006 			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13007 		}
13008 		error = copyout((caddr_t)&sfs, bufp, copy_size);
13009 	}
13010 
13011 	if (sizep != NULL) {
13012 		*sizep = my_size;
13013 	}
13014 	return error;
13015 }
13016 
13017 /*
13018  * copy stat structure into user_stat structure.
13019  */
13020 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13021 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13022 {
13023 	bzero(usbp, sizeof(*usbp));
13024 
13025 	usbp->st_dev = sbp->st_dev;
13026 	usbp->st_ino = sbp->st_ino;
13027 	usbp->st_mode = sbp->st_mode;
13028 	usbp->st_nlink = sbp->st_nlink;
13029 	usbp->st_uid = sbp->st_uid;
13030 	usbp->st_gid = sbp->st_gid;
13031 	usbp->st_rdev = sbp->st_rdev;
13032 #ifndef _POSIX_C_SOURCE
13033 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13034 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13035 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13036 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13037 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13038 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13039 #else
13040 	usbp->st_atime = sbp->st_atime;
13041 	usbp->st_atimensec = sbp->st_atimensec;
13042 	usbp->st_mtime = sbp->st_mtime;
13043 	usbp->st_mtimensec = sbp->st_mtimensec;
13044 	usbp->st_ctime = sbp->st_ctime;
13045 	usbp->st_ctimensec = sbp->st_ctimensec;
13046 #endif
13047 	usbp->st_size = sbp->st_size;
13048 	usbp->st_blocks = sbp->st_blocks;
13049 	usbp->st_blksize = sbp->st_blksize;
13050 	usbp->st_flags = sbp->st_flags;
13051 	usbp->st_gen = sbp->st_gen;
13052 	usbp->st_lspare = sbp->st_lspare;
13053 	usbp->st_qspare[0] = sbp->st_qspare[0];
13054 	usbp->st_qspare[1] = sbp->st_qspare[1];
13055 }
13056 
/*
 * copy stat structure into the 32-bit-process user_stat layout.
 * Timestamps are narrowed with explicit casts (user32_time_t /
 * user32_long_t); usbp is fully zeroed first so padding never leaks
 * kernel memory.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
/* the two branches copy the same timestamps; only the field naming differs */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13093 
13094 /*
13095  * copy stat64 structure into user_stat64 structure.
13096  */
13097 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)13098 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
13099 {
13100 	bzero(usbp, sizeof(*usbp));
13101 
13102 	usbp->st_dev = sbp->st_dev;
13103 	usbp->st_ino = sbp->st_ino;
13104 	usbp->st_mode = sbp->st_mode;
13105 	usbp->st_nlink = sbp->st_nlink;
13106 	usbp->st_uid = sbp->st_uid;
13107 	usbp->st_gid = sbp->st_gid;
13108 	usbp->st_rdev = sbp->st_rdev;
13109 #ifndef _POSIX_C_SOURCE
13110 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13111 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13112 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13113 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13114 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13115 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13116 	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
13117 	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
13118 #else
13119 	usbp->st_atime = sbp->st_atime;
13120 	usbp->st_atimensec = sbp->st_atimensec;
13121 	usbp->st_mtime = sbp->st_mtime;
13122 	usbp->st_mtimensec = sbp->st_mtimensec;
13123 	usbp->st_ctime = sbp->st_ctime;
13124 	usbp->st_ctimensec = sbp->st_ctimensec;
13125 	usbp->st_birthtime = sbp->st_birthtime;
13126 	usbp->st_birthtimensec = sbp->st_birthtimensec;
13127 #endif
13128 	usbp->st_size = sbp->st_size;
13129 	usbp->st_blocks = sbp->st_blocks;
13130 	usbp->st_blksize = sbp->st_blksize;
13131 	usbp->st_flags = sbp->st_flags;
13132 	usbp->st_gen = sbp->st_gen;
13133 	usbp->st_lspare = sbp->st_lspare;
13134 	usbp->st_qspare[0] = sbp->st_qspare[0];
13135 	usbp->st_qspare[1] = sbp->st_qspare[1];
13136 }
13137 
/*
 * copy stat64 structure into the 32-bit-process user_stat64 layout.
 * Timestamps (including birthtime) are narrowed with explicit casts;
 * usbp is fully zeroed first so padding never leaks kernel memory.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
/* the two branches copy the same timestamps; only the field naming differs */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13178 
13179 /*
13180  * Purge buffer cache for simulating cold starts
13181  */
13182 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13183 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13184 {
13185 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13186 
13187 	return VNODE_RETURNED;
13188 }
13189 
13190 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13191 vfs_purge_callback(mount_t mp, __unused void * arg)
13192 {
13193 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13194 
13195 	return VFS_RETURNED;
13196 }
13197 
13198 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13199 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13200 {
13201 	if (!kauth_cred_issuser(kauth_cred_get())) {
13202 		return EPERM;
13203 	}
13204 
13205 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13206 
13207 	return 0;
13208 }
13209 
13210 /*
13211  * gets the vnode associated with the (unnamed) snapshot directory
13212  * for a Filesystem. The snapshot directory vnode is returned with
13213  * an iocount on it.
13214  */
13215 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13216 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13217 {
13218 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13219 }
13220 
13221 /*
13222  * Get the snapshot vnode.
13223  *
 * If successful, the call returns with an iocount on *rvpp and *sdvpp, and
13225  * needs nameidone() on ndp.
13226  *
13227  * If the snapshot vnode exists it is returned in ndp->ni_vp.
13228  *
13229  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13230  * not needed.
13231  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-null the outputs so the shared error path can test them safely. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Take an iocount on the vnode backing dirfd. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only valid relative to a filesystem root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Reject any '/' in the name; the loop exits early if one is found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy hooks exist only for the create and delete operations. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	if (error) {
		/* On failure, drop both iocounts and re-null the outputs. */
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
13334 
13335 /*
13336  * create a filesystem snapshot (for supporting filesystems)
13337  *
13338  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13339  * We get to the (unnamed) snapshot directory vnode and create the vnode
13340  * for the snapshot in it.
13341  *
13342  * Restrictions:
13343  *
13344  *    a) Passed in name for snapshot cannot have slashes.
13345  *    b) name can't be "." or ".."
13346  *
13347  * Since this requires superuser privileges, vnode_authorize calls are not
13348  * made.
13349  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * On success this returns with iocounts on rvp and snapdvp, namei
	 * state in ndp, and (if the name already exists) an iocount on
	 * ndp->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* A snapshot with this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/*
		 * Privileged path (see fs_snapshot): skip authorization and
		 * ACL/mode inheritance when creating the snapshot vnode.
		 */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13396 
13397 /*
13398  * Delete a Filesystem snapshot
13399  *
13400  * get the vnode for the unnamed snapshot directory and the snapshot and
13401  * delete the snapshot.
13402  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * On success this returns with iocounts on rvp, snapdvp and the
	 * snapshot vnode itself (ndp->ni_vp); ENOENT if it doesn't exist.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot; suppress the namespace event for this unlink. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13431 
13432 /*
13433  * Revert a filesystem to a snapshot
13434  *
13435  * Marks the filesystem to revert to the given snapshot on next mount.
13436  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Take an iocount on the vnode backing dirfd. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/*
	 * Build a synthetic componentname carrying the snapshot name for the
	 * filesystem's ioctl handler. name_len from copyinstr includes the
	 * terminating NUL.
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/*
		 * Fallback: look up the snapshot vnode itself and issue the
		 * (APFS-specific) per-vnode revert ioctl against it.
		 */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
13520 
13521 /*
13522  * rename a Filesystem snapshot
13523  *
13524  * get the vnode for the unnamed snapshot directory and the snapshot and
13525  * rename the snapshot. This is a very specialised (and simple) case of
13526  * rename(2) (which has to deal with a lot more complications). It differs
13527  * slightly from rename(2) in that EEXIST is returned if the new name exists.
13528  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot (DELETE semantics for the rename
	 * source). On success we hold iocounts on rvp, snapdvp and fvp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Reject any '/' in the new name; the loop exits early if found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* The rename target is a new snapshot name, so use the create check. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name under the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Source and target share the same parent: the snapshot directory. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
13623 
13624 /*
13625  * Mount a Filesystem snapshot
13626  *
13627  * get the vnode for the unnamed snapshot directory and the snapshot and
13628  * mount the snapshot.
13629  */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Look up the snapshot to be mounted. On success we hold iocounts
	 * on rvp, snapdvp and the snapshot vnode (snapndp->ni_vp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* Bail if the source filesystem has been force-unmounted under us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root directory of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Hand the snapshot's mount and componentname to mount_common via
	 * fs_snapshot_mount_args; KERNEL_MOUNT_SNAPSHOT selects the
	 * snapshot-mount path.
	 */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
13705 
13706 /*
13707  * Root from a snapshot of the filesystem
13708  *
13709  * Marks the filesystem to root from the given snapshot on next boot.
13710  */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Take an iocount on the vnode backing dirfd. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/*
	 * Build a synthetic componentname carrying the snapshot name for the
	 * filesystem's ioctl handler. name_len from copyinstr includes the
	 * terminating NUL.
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
13766 
13767 /*
13768  * FS snapshot operations dispatcher
13769  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot operations require the PRIV_VFS_SNAPSHOT privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permit the operation if the caller is the superuser, can
		 * write to the backing device, or holds the snapshot
		 * entitlement; otherwise fail with EPERM.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOCurrentTaskHasEntitlement("com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name2 is the mount point path; data carries mount args. */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
13859