xref: /xnu-8020.121.3/bsd/vfs/vfs_syscalls.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117 
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122 
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125 
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130 
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137 
138 #include <nfs/nfs_conf.h>
139 
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143 
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148 
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 	((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 	release_pathbuff(x)
154 #else
155 #define GET_PATH(x)     \
156 	((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 	zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160 
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164 
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168 
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
171 #endif
172 
173 extern void disk_conditioner_unmount(mount_t mp);
174 
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 	vnode_t olddp;
178 	vnode_t newdp;
179 };
180 /* callback  for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182 
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192     boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195     struct componentname *cnp, user_addr_t fsmountargs,
196     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198 
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200 
201 struct fd_vn_data * fg_vn_data_alloc(void);
202 
203 /*
204  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205  * Concurrent lookups (or lookups by ids) on hard links can cause the
206  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207  * does) to return ENOENT as the path cannot be returned from the name cache
208  * alone. We have no option but to retry and hope to get one namei->reverse path
209  * generation done without an intervening lookup, lookup by id on the hard link
210  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211  * which currently are the MAC hooks for rename, unlink and rmdir.
212  */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214 
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217 
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219     int unlink_flags);
220 
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229 
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236 
237 __private_extern__
238 int sync_internal(void);
239 
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242 
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245 
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249 
250 extern lck_rw_t rootvnode_rw_lock;
251 
252 /*
253  * incremented each time a mount or unmount operation occurs
254  * used to invalidate the cached value of the rootvp in the
255  * mount structure utilized by cache_lookup_path
256  */
257 uint32_t mount_generation = 0;
258 
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261 
262 /* system-wide, per-boot unique mount ID */
263 static _Atomic uint64_t mount_unique_id = 1;
264 
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269 
270 /*
271  * Virtual File System System Calls
272  */
273 
274 /*
275  * Private in-kernel mounting spi (specific use-cases only)
276  */
277 boolean_t
vfs_iskernelmount(mount_t mp)278 vfs_iskernelmount(mount_t mp)
279 {
280 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
281 }
282 
/*
 * kernel_mount:
 *	Perform a mount on behalf of an in-kernel caller.
 *
 * Parameters:
 *	fstype		vfs name of the filesystem to mount
 *	pvp		parent of the covered vnode (may be NULLVP with vp)
 *	vp		vnode to cover; if NULLVP, it is looked up from 'path'
 *	path		kernel-space path of the mount-on point
 *	data		filesystem-specific mount arguments (kernel address)
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* internal flags (sanitized below)
 *	ctx		caller's vfs context
 *
 * Returns:	0 on success, an errno value otherwise.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/*
	 * Set up a kernel-space lookup of 'path'.  WANTPARENT ensures that a
	 * successful namei() returns both the covered vnode and its parent.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any internal flag bits callers are not permitted to pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Only snapshot / volume-by-role mounts log the lookup failure. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; fabricate just enough of the
		 * componentname (path buffer and length) for mount_common().
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Tag this request as an in-kernel mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/*
	 * Release the iocounts and namei state only if we acquired them via
	 * namei() above; caller-supplied vnodes remain the caller's to drop.
	 */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
332 
333 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)334 vfs_mount_at_path(const char *fstype, const char *path,
335     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
336     int mnt_flags, int flags)
337 {
338 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
339 	int error, km_flags = 0;
340 
341 	/*
342 	 * This call is currently restricted to specific use cases.
343 	 */
344 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
345 		return ENOTSUP;
346 	}
347 
348 #if !defined(XNU_TARGET_OS_OSX)
349 	if (strcmp(fstype, "lifs") == 0) {
350 		syscall_flags |= MNT_NOEXEC;
351 	}
352 #endif
353 
354 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
355 		km_flags |= KERNEL_MOUNT_NOAUTH;
356 	}
357 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
358 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
359 	}
360 
361 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
362 	    syscall_flags, km_flags, vfs_context_kernel());
363 	if (error) {
364 		printf("%s: mount on %s failed, error %d\n", __func__, path,
365 		    error);
366 	}
367 
368 	return error;
369 }
370 
371 int
vfs_mount_override_type_name(mount_t mp,const char * name)372 vfs_mount_override_type_name(mount_t mp, const char *name)
373 {
374 	if (mp == NULL || name == NULL) {
375 		return EINVAL;
376 	}
377 
378 	/* Override the FS type name. */
379 	mount_lock_spin(mp);
380 	strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
381 	mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
382 	mount_unlock(mp);
383 
384 	return 0;
385 }
386 
387 /*
388  * Mount a file system.
389  */
390 /* ARGSUSED */
391 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)392 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
393 {
394 	struct __mac_mount_args muap;
395 
396 	muap.type = uap->type;
397 	muap.path = uap->path;
398 	muap.flags = uap->flags;
399 	muap.data = uap->data;
400 	muap.mac_p = USER_ADDR_NULL;
401 	return __mac_mount(p, &muap, retval);
402 }
403 
404 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)405 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
406 {
407 	struct componentname    cn;
408 	vfs_context_t           ctx = vfs_context_current();
409 	size_t                  dummy = 0;
410 	int                     error;
411 	int                     flags = uap->flags;
412 	char                    fstypename[MFSNAMELEN];
413 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
414 	vnode_t                 pvp;
415 	vnode_t                 vp;
416 
417 	AUDIT_ARG(fd, uap->fd);
418 	AUDIT_ARG(fflags, flags);
419 	/* fstypename will get audited by mount_common */
420 
421 	/* Sanity check the flags */
422 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
423 		return ENOTSUP;
424 	}
425 
426 	if (flags & MNT_UNION) {
427 		return EPERM;
428 	}
429 
430 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
431 	if (error) {
432 		return error;
433 	}
434 
435 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
436 		return error;
437 	}
438 
439 	if ((error = vnode_getwithref(vp)) != 0) {
440 		file_drop(uap->fd);
441 		return error;
442 	}
443 
444 	pvp = vnode_getparent(vp);
445 	if (pvp == NULL) {
446 		vnode_put(vp);
447 		file_drop(uap->fd);
448 		return EINVAL;
449 	}
450 
451 	memset(&cn, 0, sizeof(struct componentname));
452 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
453 	cn.cn_pnlen = MAXPATHLEN;
454 
455 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
456 		zfree(ZV_NAMEI, cn.cn_pnbuf);
457 		vnode_put(pvp);
458 		vnode_put(vp);
459 		file_drop(uap->fd);
460 		return error;
461 	}
462 
463 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
464 
465 	zfree(ZV_NAMEI, cn.cn_pnbuf);
466 	vnode_put(pvp);
467 	vnode_put(vp);
468 	file_drop(uap->fd);
469 
470 	return error;
471 }
472 
/*
 * Notify interested parties that a mount occurred: broadcast a VQ_MOUNT
 * vfs event, then post a NOTE_WRITE knote on the parent of the covered
 * vnode so watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
479 
480 /*
481  * __mac_mount:
482  *	Mount a file system taking into account MAC label behavior.
483  *	See mount(2) man page for more information
484  *
485  * Parameters:    p                        Process requesting the mount
486  *                uap                      User argument descriptor (see below)
487  *                retval                   (ignored)
488  *
489  * Indirect:      uap->type                Filesystem type
490  *                uap->path                Path to mount
491  *                uap->data                Mount arguments
492  *                uap->mac_p               MAC info
493  *                uap->flags               Mount flags
494  *
495  *
496  * Returns:        0                       Success
497  *                !0                       Not success
498  */
499 boolean_t root_fs_upgrade_try = FALSE;
500 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;          /* set once namei() succeeds; gates nameidone() in 'out' */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;           /* copied-in MAC label, freed unconditionally at 'out' */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (and, via WANTPARENT, its parent).
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.  The struct layout
	 * differs between 32- and 64-bit callers, so copy in the matching
	 * variant and normalize into 'mac'.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Reject label buffers that are absurdly large or too small to hold a string. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		/* On copyinstr failure, 'out' frees labelstr via kfree_data. */
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out on this configuration. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the target is the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data is a no-op for (NULL, 0), so this is safe on early exits. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the iocounts taken by namei(), then release its state. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
661 
662 /*
663  * common mount implementation (final stage of mounting)
664  *
665  * Arguments:
666  *  fstypename	file system type (ie it's vfs name)
667  *  pvp		parent of covered vnode
668  *  vp		covered vnode
669  *  cnp		component name (ie path) of covered vnode
670  *  flags	generic mount flags
671  *  fsmountargs	file system specific data
672  *  labelstr	optional MAC label
673  *  kernelmount	TRUE for mounts initiated from inside the kernel
674  *  ctx		caller's context
675  */
676 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)677 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
678     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
679     char *labelstr, vfs_context_t ctx)
680 {
681 #if !CONFIG_MACF
682 #pragma unused(labelstr)
683 #endif
684 	struct vnode *devvp = NULLVP;
685 	struct vnode *device_vnode = NULLVP;
686 #if CONFIG_MACF
687 	struct vnode *rvp;
688 #endif
689 	struct mount *mp;
690 	struct vfstable *vfsp = (struct vfstable *)0;
691 	struct proc *p = vfs_context_proc(ctx);
692 	int error, flag = 0;
693 	bool flag_set = false;
694 	user_addr_t devpath = USER_ADDR_NULL;
695 	int ronly = 0;
696 	int mntalloc = 0;
697 	boolean_t vfsp_ref = FALSE;
698 	boolean_t is_rwlock_locked = FALSE;
699 	boolean_t did_rele = FALSE;
700 	boolean_t have_usecount = FALSE;
701 	boolean_t did_set_lmount = FALSE;
702 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
703 
704 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
705 	/* Check for mutually-exclusive flag bits */
706 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
707 	int bitcount = 0;
708 	while (checkflags != 0) {
709 		checkflags &= (checkflags - 1);
710 		bitcount++;
711 	}
712 
713 	if (bitcount > 1) {
714 		//not allowed to request multiple mount-by-role flags
715 		error = EINVAL;
716 		goto out1;
717 	}
718 #endif
719 
720 	/*
721 	 * Process an update for an existing mount
722 	 */
723 	if (flags & MNT_UPDATE) {
724 		if ((vp->v_flag & VROOT) == 0) {
725 			error = EINVAL;
726 			goto out1;
727 		}
728 		mp = vp->v_mount;
729 
730 		/* if unmount or mount in progress, return error */
731 		mount_lock_spin(mp);
732 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
733 			mount_unlock(mp);
734 			error = EBUSY;
735 			goto out1;
736 		}
737 		mp->mnt_lflag |= MNT_LMOUNT;
738 		did_set_lmount = TRUE;
739 		mount_unlock(mp);
740 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
741 		is_rwlock_locked = TRUE;
742 		/*
743 		 * We only allow the filesystem to be reloaded if it
744 		 * is currently mounted read-only.
745 		 */
746 		if ((flags & MNT_RELOAD) &&
747 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
748 			error = ENOTSUP;
749 			goto out1;
750 		}
751 
752 		/*
753 		 * If content protection is enabled, update mounts are not
754 		 * allowed to turn it off.
755 		 */
756 		if ((mp->mnt_flag & MNT_CPROTECT) &&
757 		    ((flags & MNT_CPROTECT) == 0)) {
758 			error = EINVAL;
759 			goto out1;
760 		}
761 
762 		/*
763 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
764 		 * failure to return an error for this so we'll just silently
765 		 * add it if it is not passed in.
766 		 */
767 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
768 		    ((flags & MNT_REMOVABLE) == 0)) {
769 			flags |= MNT_REMOVABLE;
770 		}
771 
772 		/* Can't downgrade the backer of the root FS */
773 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
774 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
775 			error = ENOTSUP;
776 			goto out1;
777 		}
778 
779 		/*
780 		 * Only root, or the user that did the original mount is
781 		 * permitted to update it.
782 		 */
783 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
784 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
785 			goto out1;
786 		}
787 #if CONFIG_MACF
788 		error = mac_mount_check_remount(ctx, mp);
789 		if (error != 0) {
790 			goto out1;
791 		}
792 #endif
793 		/*
794 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
795 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
796 		 */
797 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
798 			flags |= MNT_NOSUID | MNT_NODEV;
799 			if (mp->mnt_flag & MNT_NOEXEC) {
800 				flags |= MNT_NOEXEC;
801 			}
802 		}
803 		flag = mp->mnt_flag;
804 		flag_set = true;
805 
806 
807 
808 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
809 
810 		vfsp = mp->mnt_vtable;
811 		goto update;
812 	} // MNT_UPDATE
813 
814 	/*
815 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
816 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
817 	 */
818 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
819 		flags |= MNT_NOSUID | MNT_NODEV;
820 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
821 			flags |= MNT_NOEXEC;
822 		}
823 	}
824 
825 	/* XXXAUDIT: Should we capture the type on the error path as well? */
826 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
827 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
828 	mount_list_lock();
829 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
830 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
831 			vfsp->vfc_refcount++;
832 			vfsp_ref = TRUE;
833 			break;
834 		}
835 	}
836 	mount_list_unlock();
837 	if (vfsp == NULL) {
838 		error = ENODEV;
839 		goto out1;
840 	}
841 
842 	/*
843 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
844 	 * except in ROSV configs and for the initial BaseSystem root.
845 	 */
846 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
847 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
848 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
849 		error = EINVAL;  /* unsupported request */
850 		goto out1;
851 	}
852 
853 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
854 	if (error != 0) {
855 		goto out1;
856 	}
857 
858 	/*
859 	 * Allocate and initialize the filesystem (mount_t)
860 	 */
861 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
862 	mntalloc = 1;
863 
864 	/* Initialize the default IO constraints */
865 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
866 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
867 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
868 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
869 	mp->mnt_devblocksize = DEV_BSIZE;
870 	mp->mnt_alignmentmask = PAGE_MASK;
871 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
872 	mp->mnt_ioscale = 1;
873 	mp->mnt_ioflags = 0;
874 	mp->mnt_realrootvp = NULLVP;
875 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
876 
877 	mp->mnt_lflag |= MNT_LMOUNT;
878 	did_set_lmount = TRUE;
879 
880 	TAILQ_INIT(&mp->mnt_vnodelist);
881 	TAILQ_INIT(&mp->mnt_workerqueue);
882 	TAILQ_INIT(&mp->mnt_newvnodes);
883 	mount_lock_init(mp);
884 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
885 	is_rwlock_locked = TRUE;
886 	mp->mnt_op = vfsp->vfc_vfsops;
887 	mp->mnt_vtable = vfsp;
888 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
889 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
890 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
891 	do {
892 		int pathlen = MAXPATHLEN;
893 
894 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
895 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
896 		}
897 	} while (0);
898 	mp->mnt_vnodecovered = vp;
899 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
900 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
901 	mp->mnt_devbsdunit = 0;
902 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
903 
904 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
905 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
906 
907 	if (kernelmount) {
908 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
909 	}
910 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
911 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
912 	}
913 
914 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
915 		// kernel mounted devfs
916 		mp->mnt_kern_flag |= MNTK_SYSTEM;
917 	}
918 
919 update:
920 
921 	/*
922 	 * Set the mount level flags.
923 	 */
924 	if (flags & MNT_RDONLY) {
925 		mp->mnt_flag |= MNT_RDONLY;
926 	} else if (mp->mnt_flag & MNT_RDONLY) {
927 		// disallow read/write upgrades of file systems that
928 		// had the TYPENAME_OVERRIDE feature set.
929 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
930 			error = EPERM;
931 			goto out1;
932 		}
933 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
934 	}
935 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
936 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
937 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
938 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
939 	    MNT_QUARANTINE | MNT_CPROTECT);
940 
941 #if SECURE_KERNEL
942 #if !CONFIG_MNT_SUID
943 	/*
944 	 * On release builds of iOS based platforms, always enforce NOSUID on
945 	 * all mounts. We do this here because we can catch update mounts as well as
946 	 * non-update mounts in this case.
947 	 */
948 	mp->mnt_flag |= (MNT_NOSUID);
949 #endif
950 #endif
951 
952 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
953 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
954 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
955 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
956 	    MNT_QUARANTINE | MNT_CPROTECT);
957 
958 #if CONFIG_MACF
959 	if (flags & MNT_MULTILABEL) {
960 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
961 			error = EINVAL;
962 			goto out1;
963 		}
964 		mp->mnt_flag |= MNT_MULTILABEL;
965 	}
966 #endif
967 	/*
968 	 * Process device path for local file systems if requested.
969 	 *
970 	 * Snapshot and mount-by-role mounts do not use this path; they are
971 	 * passing other opaque data in the device path field.
972 	 *
973 	 * Basesystemroot mounts pass a device path to be resolved here,
974 	 * but it's just a char * already inside the kernel, which
975 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
976 	 * mounts we must skip copyin (both of the address and of the string
977 	 * (in NDINIT).
978 	 */
979 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
980 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
981 		boolean_t do_copyin_devpath = true;
982 #if CONFIG_BASESYSTEMROOT
983 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
985 			// We have been passed fsmountargs, which is typed as a user_addr_t,
986 			// but is actually a char ** pointing to a (kernelspace) string.
987 			// We manually unpack it with a series of casts and dereferences
988 			// that reverses what was done just above us on the stack in
989 			// imageboot_pivot_image().
990 			// After retrieving the path to the dev node (which we will NDINIT
991 			// in a moment), we pass NULL fsmountargs on to the filesystem.
992 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
993 			char **devnamepp = (char **)fsmountargs;
994 			char *devnamep = *devnamepp;
995 			devpath = CAST_USER_ADDR_T(devnamep);
996 			do_copyin_devpath = false;
997 			fsmountargs = USER_ADDR_NULL;
998 
999 			//Now that we have a mp, denote that this mount is for the basesystem.
1000 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1001 		}
1002 #endif // CONFIG_BASESYSTEMROOT
1003 
1004 		if (do_copyin_devpath) {
1005 			if (vfs_context_is64bit(ctx)) {
1006 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1007 					goto out1;
1008 				}
1009 				fsmountargs += sizeof(devpath);
1010 			} else {
1011 				user32_addr_t tmp;
1012 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1013 					goto out1;
1014 				}
1015 				/* munge into LP64 addr */
1016 				devpath = CAST_USER_ADDR_T(tmp);
1017 				fsmountargs += sizeof(tmp);
1018 			}
1019 		}
1020 
1021 		/* Lookup device and authorize access to it */
1022 		if ((devpath)) {
1023 			struct nameidata nd;
1024 
1025 			enum uio_seg seg = UIO_USERSPACE;
1026 #if CONFIG_BASESYSTEMROOT
1027 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1028 				seg = UIO_SYSSPACE;
1029 			}
1030 #endif // CONFIG_BASESYSTEMROOT
1031 
1032 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1033 			if ((error = namei(&nd))) {
1034 				goto out1;
1035 			}
1036 
1037 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1038 			devvp = nd.ni_vp;
1039 
1040 			nameidone(&nd);
1041 
1042 			if (devvp->v_type != VBLK) {
1043 				error = ENOTBLK;
1044 				goto out2;
1045 			}
1046 			if (major(devvp->v_rdev) >= nblkdev) {
1047 				error = ENXIO;
1048 				goto out2;
1049 			}
1050 			/*
1051 			 * If mount by non-root, then verify that user has necessary
1052 			 * permissions on the device.
1053 			 */
1054 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1055 				mode_t accessmode = KAUTH_VNODE_READ_DATA;
1056 
1057 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1058 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1059 				}
1060 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1061 					goto out2;
1062 				}
1063 			}
1064 		}
1065 		/* On first mount, preflight and open device */
1066 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1067 			if ((error = vnode_ref(devvp))) {
1068 				goto out2;
1069 			}
1070 			/*
1071 			 * Disallow multiple mounts of the same device.
1072 			 * Disallow mounting of a device that is currently in use
1073 			 * (except for root, which might share swap device for miniroot).
1074 			 * Flush out any old buffers remaining from a previous use.
1075 			 */
1076 			if ((error = vfs_mountedon(devvp))) {
1077 				goto out3;
1078 			}
1079 
1080 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1081 				error = EBUSY;
1082 				goto out3;
1083 			}
1084 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1085 				error = ENOTBLK;
1086 				goto out3;
1087 			}
1088 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1089 				goto out3;
1090 			}
1091 
1092 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1093 #if CONFIG_MACF
1094 			error = mac_vnode_check_open(ctx,
1095 			    devvp,
1096 			    ronly ? FREAD : FREAD | FWRITE);
1097 			if (error) {
1098 				goto out3;
1099 			}
1100 #endif /* MAC */
1101 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1102 				goto out3;
1103 			}
1104 
1105 			mp->mnt_devvp = devvp;
1106 			device_vnode = devvp;
1107 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1108 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1109 		    (device_vnode = mp->mnt_devvp)) {
1110 			dev_t dev;
1111 			int maj;
1112 			/*
1113 			 * If upgrade to read-write by non-root, then verify
1114 			 * that user has necessary permissions on the device.
1115 			 */
1116 			vnode_getalways(device_vnode);
1117 
1118 			if (suser(vfs_context_ucred(ctx), NULL) &&
1119 			    (error = vnode_authorize(device_vnode, NULL,
1120 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1121 			    ctx)) != 0) {
1122 				vnode_put(device_vnode);
1123 				goto out2;
1124 			}
1125 
1126 			/* Tell the device that we're upgrading */
1127 			dev = (dev_t)device_vnode->v_rdev;
1128 			maj = major(dev);
1129 
1130 			if ((u_int)maj >= (u_int)nblkdev) {
1131 				panic("Volume mounted on a device with invalid major number.");
1132 			}
1133 
1134 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1135 			vnode_put(device_vnode);
1136 			device_vnode = NULLVP;
1137 			if (error != 0) {
1138 				goto out2;
1139 			}
1140 		}
1141 	} // localargs && !(snapshot | data | vm)
1142 
1143 #if CONFIG_MACF
1144 	if ((flags & MNT_UPDATE) == 0) {
1145 		mac_mount_label_init(mp);
1146 		mac_mount_label_associate(ctx, mp);
1147 	}
1148 	if (labelstr) {
1149 		if ((flags & MNT_UPDATE) != 0) {
1150 			error = mac_mount_check_label_update(ctx, mp);
1151 			if (error != 0) {
1152 				goto out3;
1153 			}
1154 		}
1155 	}
1156 #endif
1157 	/*
1158 	 * Mount the filesystem.  We already asserted that internal_flags
1159 	 * cannot have more than one mount-by-role bit set.
1160 	 */
1161 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1162 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1163 		    (caddr_t)fsmountargs, 0, ctx);
1164 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1165 #if CONFIG_ROSV_STARTUP
1166 		struct mount *origin_mp = (struct mount*)fsmountargs;
1167 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1168 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1169 		if (error) {
1170 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1171 		} else {
1172 			/* Mark volume associated with system volume */
1173 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1174 
1175 			/* Attempt to acquire the mnt_devvp and set it up */
1176 			struct vnode *mp_devvp = NULL;
1177 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1178 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1179 				    0, &mp_devvp, vfs_context_kernel());
1180 				if (!lerr) {
1181 					mp->mnt_devvp = mp_devvp;
1182 					//vnode_lookup took an iocount, need to drop it.
1183 					vnode_put(mp_devvp);
1184 					// now set `device_vnode` to the devvp that was acquired.
1185 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1186 					// note that though the iocount above was dropped, the mount acquires
1187 					// an implicit reference against the device.
1188 					device_vnode = mp_devvp;
1189 				}
1190 			}
1191 		}
1192 #else
1193 		error = EINVAL;
1194 #endif
1195 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1196 #if CONFIG_MOUNT_VM
1197 		struct mount *origin_mp = (struct mount*)fsmountargs;
1198 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1199 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1200 		if (error) {
1201 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1202 		} else {
1203 			/* Mark volume associated with system volume and a swap mount */
1204 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1205 			/* Attempt to acquire the mnt_devvp and set it up */
1206 			struct vnode *mp_devvp = NULL;
1207 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1208 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1209 				    0, &mp_devvp, vfs_context_kernel());
1210 				if (!lerr) {
1211 					mp->mnt_devvp = mp_devvp;
1212 					//vnode_lookup took an iocount, need to drop it.
1213 					vnode_put(mp_devvp);
1214 
1215 					// now set `device_vnode` to the devvp that was acquired.
1216 					// note that though the iocount above was dropped, the mount acquires
1217 					// an implicit reference against the device.
1218 					device_vnode = mp_devvp;
1219 				}
1220 			}
1221 		}
1222 #else
1223 		error = EINVAL;
1224 #endif
1225 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1226 #if CONFIG_MOUNT_PREBOOTRECOVERY
1227 		struct mount *origin_mp = (struct mount*)fsmountargs;
1228 		uint32_t mount_role = 0;
1229 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1230 			mount_role = VFS_PREBOOT_ROLE;
1231 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1232 			mount_role = VFS_RECOVERY_ROLE;
1233 		}
1234 
1235 		if (mount_role != 0) {
1236 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1237 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1238 			if (error) {
1239 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1240 			} else {
1241 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1242 				/* Mark volume associated with system volume */
1243 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1244 				/* Attempt to acquire the mnt_devvp and set it up */
1245 				struct vnode *mp_devvp = NULL;
1246 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1247 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1248 					    0, &mp_devvp, vfs_context_kernel());
1249 					if (!lerr) {
1250 						mp->mnt_devvp = mp_devvp;
1251 						//vnode_lookup took an iocount, need to drop it.
1252 						vnode_put(mp_devvp);
1253 
1254 						// now set `device_vnode` to the devvp that was acquired.
1255 						// note that though the iocount above was dropped, the mount acquires
1256 						// an implicit reference against the device.
1257 						device_vnode = mp_devvp;
1258 					}
1259 				}
1260 			}
1261 		} else {
1262 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1263 			error = EINVAL;
1264 		}
1265 #else
1266 		error = EINVAL;
1267 #endif
1268 	} else {
1269 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1270 	}
1271 
1272 	if (flags & MNT_UPDATE) {
1273 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1274 			mp->mnt_flag &= ~MNT_RDONLY;
1275 		}
1276 		mp->mnt_flag &= ~
1277 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1278 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1279 		if (error) {
1280 			mp->mnt_flag = flag;  /* restore flag value */
1281 		}
1282 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1283 		lck_rw_done(&mp->mnt_rwlock);
1284 		is_rwlock_locked = FALSE;
1285 		if (!error) {
1286 			enablequotas(mp, ctx);
1287 		}
1288 		goto exit;
1289 	}
1290 
1291 	/*
1292 	 * Put the new filesystem on the mount list after root.
1293 	 */
1294 	if (error == 0) {
1295 		struct vfs_attr vfsattr;
1296 #if CONFIG_MACF
1297 		error = mac_mount_check_mount_late(ctx, mp);
1298 		if (error != 0) {
1299 			goto out4;
1300 		}
1301 
1302 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1303 			error = VFS_ROOT(mp, &rvp, ctx);
1304 			if (error) {
1305 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1306 				goto out4;
1307 			}
1308 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1309 			/*
1310 			 * drop reference provided by VFS_ROOT
1311 			 */
1312 			vnode_put(rvp);
1313 
1314 			if (error) {
1315 				goto out4;
1316 			}
1317 		}
1318 #endif  /* MAC */
1319 
1320 		vnode_lock_spin(vp);
1321 		CLR(vp->v_flag, VMOUNT);
1322 		vp->v_mountedhere = mp;
1323 		vnode_unlock(vp);
1324 
1325 		/*
1326 		 * taking the name_cache_lock exclusively will
1327 		 * insure that everyone is out of the fast path who
1328 		 * might be trying to use a now stale copy of
1329 		 * vp->v_mountedhere->mnt_realrootvp
1330 		 * bumping mount_generation causes the cached values
1331 		 * to be invalidated
1332 		 */
1333 		name_cache_lock();
1334 		mount_generation++;
1335 		name_cache_unlock();
1336 
1337 		error = vnode_ref(vp);
1338 		if (error != 0) {
1339 			goto out4;
1340 		}
1341 
1342 		have_usecount = TRUE;
1343 
1344 		error = checkdirs(vp, ctx);
1345 		if (error != 0) {
1346 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1347 			goto out4;
1348 		}
1349 		/*
1350 		 * there is no cleanup code here so I have made it void
1351 		 * we need to revisit this
1352 		 */
1353 		(void)VFS_START(mp, 0, ctx);
1354 
1355 		if (mount_list_add(mp) != 0) {
1356 			/*
1357 			 * The system is shutting down trying to umount
1358 			 * everything, so fail with a plausible errno.
1359 			 */
1360 			error = EBUSY;
1361 			goto out4;
1362 		}
1363 		lck_rw_done(&mp->mnt_rwlock);
1364 		is_rwlock_locked = FALSE;
1365 
1366 		/* Check if this mounted file system supports EAs or named streams. */
1367 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1368 		VFSATTR_INIT(&vfsattr);
1369 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1370 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1371 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1372 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1373 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1374 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1375 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1376 			}
1377 #if NAMEDSTREAMS
1378 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1379 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1380 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1381 			}
1382 #endif
1383 			/* Check if this file system supports path from id lookups. */
1384 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1385 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1386 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1387 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1388 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1389 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1390 			}
1391 
1392 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1393 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1394 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1395 			}
1396 		}
1397 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1398 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1399 		}
1400 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1401 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1402 		}
1403 		/* increment the operations count */
1404 		OSAddAtomic(1, &vfs_nummntops);
1405 		enablequotas(mp, ctx);
1406 
1407 		if (device_vnode) {
1408 			device_vnode->v_specflags |= SI_MOUNTEDON;
1409 
1410 			/*
1411 			 *   cache the IO attributes for the underlying physical media...
1412 			 *   an error return indicates the underlying driver doesn't
1413 			 *   support all the queries necessary... however, reasonable
1414 			 *   defaults will have been set, so no reason to bail or care
1415 			 */
1416 			vfs_init_io_attributes(device_vnode, mp);
1417 		}
1418 
1419 		/* Now that mount is setup, notify the listeners */
1420 		vfs_notify_mount(pvp);
1421 		IOBSDMountChange(mp, kIOMountChangeMount);
1422 	} else {
1423 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1424 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1425 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1426 			    mp->mnt_vtable->vfc_name, error);
1427 		}
1428 
1429 		vnode_lock_spin(vp);
1430 		CLR(vp->v_flag, VMOUNT);
1431 		vnode_unlock(vp);
1432 		mount_list_lock();
1433 		mp->mnt_vtable->vfc_refcount--;
1434 		mount_list_unlock();
1435 
1436 		if (device_vnode) {
1437 			vnode_rele(device_vnode);
1438 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1439 		}
1440 		lck_rw_done(&mp->mnt_rwlock);
1441 		is_rwlock_locked = FALSE;
1442 
1443 		/*
1444 		 * if we get here, we have a mount structure that needs to be freed,
1445 		 * but since the coveredvp hasn't yet been updated to point at it,
1446 		 * no need to worry about other threads holding a crossref on this mp
1447 		 * so it's ok to just free it
1448 		 */
1449 		mount_lock_destroy(mp);
1450 #if CONFIG_MACF
1451 		mac_mount_label_destroy(mp);
1452 #endif
1453 		zfree(mount_zone, mp);
1454 		did_set_lmount = false;
1455 	}
1456 exit:
1457 	/*
1458 	 * drop I/O count on the device vp if there was one
1459 	 */
1460 	if (devpath && devvp) {
1461 		vnode_put(devvp);
1462 	}
1463 
1464 	if (did_set_lmount) {
1465 		mount_lock_spin(mp);
1466 		mp->mnt_lflag &= ~MNT_LMOUNT;
1467 		mount_unlock(mp);
1468 	}
1469 
1470 	return error;
1471 
1472 /* Error condition exits */
1473 out4:
1474 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1475 
1476 	/*
1477 	 * If the mount has been placed on the covered vp,
1478 	 * it may have been discovered by now, so we have
1479 	 * to treat this just like an unmount
1480 	 */
1481 	mount_lock_spin(mp);
1482 	mp->mnt_lflag |= MNT_LDEAD;
1483 	mount_unlock(mp);
1484 
1485 	if (device_vnode != NULLVP) {
1486 		vnode_rele(device_vnode);
1487 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1488 		    ctx);
1489 		did_rele = TRUE;
1490 	}
1491 
1492 	vnode_lock_spin(vp);
1493 
1494 	mp->mnt_crossref++;
1495 	vp->v_mountedhere = (mount_t) 0;
1496 
1497 	vnode_unlock(vp);
1498 
1499 	if (have_usecount) {
1500 		vnode_rele(vp);
1501 	}
1502 out3:
1503 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1504 		vnode_rele(devvp);
1505 	}
1506 out2:
1507 	if (devpath && devvp) {
1508 		vnode_put(devvp);
1509 	}
1510 out1:
1511 	/* Release mnt_rwlock only when it was taken */
1512 	if (is_rwlock_locked == TRUE) {
1513 		if (flag_set) {
1514 			mp->mnt_flag = flag;  /* restore mnt_flag value */
1515 		}
1516 		lck_rw_done(&mp->mnt_rwlock);
1517 	}
1518 
1519 	if (did_set_lmount) {
1520 		mount_lock_spin(mp);
1521 		mp->mnt_lflag &= ~MNT_LMOUNT;
1522 		mount_unlock(mp);
1523 	}
1524 
1525 	if (mntalloc) {
1526 		if (mp->mnt_crossref) {
1527 			mount_dropcrossref(mp, vp, 0);
1528 		} else {
1529 			mount_lock_destroy(mp);
1530 #if CONFIG_MACF
1531 			mac_mount_label_destroy(mp);
1532 #endif
1533 			zfree(mount_zone, mp);
1534 		}
1535 	}
1536 	if (vfsp_ref) {
1537 		mount_list_lock();
1538 		vfsp->vfc_refcount--;
1539 		mount_list_unlock();
1540 	}
1541 
1542 	return error;
1543 }
1544 
1545 /*
1546  * Flush in-core data, check for competing mount attempts,
1547  * and set VMOUNT
1548  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH: caller vouches for authorization (kernel-internal mounts) */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	/* KERNEL_MOUNT_FMOUNT: fmount(2)-style mount, uses a stricter busy check below */
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			/* Any failure here (getattr error or ownership mismatch) reports EPERM */
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data and invalidate cached buffers for the covered vnode */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* A mount may only cover a directory */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);
	/*
	 * fmount treats the vnode as busy if a mount is pending (VMOUNT)
	 * OR it is already covered (v_mountedhere); a regular mount only
	 * fails when both conditions hold.
	 */
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Claim the vnode for this in-progress mount attempt */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	/* MAC policy veto: roll back the VMOUNT claim if the mount is denied */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1614 
1615 #if CONFIG_IMGSRC_ACCESS
1616 
1617 #define DEBUG_IMGSRC 0
1618 
1619 #if DEBUG_IMGSRC
1620 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1621 #else
1622 #define IMGSRC_DEBUG(args...) do { } while(0)
1623 #endif
1624 
1625 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1626 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1627 {
1628 	struct nameidata nd;
1629 	vnode_t vp, realdevvp;
1630 	mode_t accessmode;
1631 	int error;
1632 	enum uio_seg uio = UIO_USERSPACE;
1633 
1634 	if (ctx == vfs_context_kernel()) {
1635 		uio = UIO_SYSSPACE;
1636 	}
1637 
1638 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1639 	if ((error = namei(&nd))) {
1640 		IMGSRC_DEBUG("namei() failed with %d\n", error);
1641 		return error;
1642 	}
1643 
1644 	vp = nd.ni_vp;
1645 
1646 	if (!vnode_isblk(vp)) {
1647 		IMGSRC_DEBUG("Not block device.\n");
1648 		error = ENOTBLK;
1649 		goto out;
1650 	}
1651 
1652 	realdevvp = mp->mnt_devvp;
1653 	if (realdevvp == NULLVP) {
1654 		IMGSRC_DEBUG("No device backs the mount.\n");
1655 		error = ENXIO;
1656 		goto out;
1657 	}
1658 
1659 	error = vnode_getwithref(realdevvp);
1660 	if (error != 0) {
1661 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1662 		goto out;
1663 	}
1664 
1665 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1666 		IMGSRC_DEBUG("Wrong dev_t.\n");
1667 		error = ENXIO;
1668 		goto out1;
1669 	}
1670 
1671 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1672 
1673 	/*
1674 	 * If mount by non-root, then verify that user has necessary
1675 	 * permissions on the device.
1676 	 */
1677 	if (!vfs_context_issuser(ctx)) {
1678 		accessmode = KAUTH_VNODE_READ_DATA;
1679 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1680 			accessmode |= KAUTH_VNODE_WRITE_DATA;
1681 		}
1682 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1683 			IMGSRC_DEBUG("Access denied.\n");
1684 			goto out1;
1685 		}
1686 	}
1687 
1688 	*devvpp = vp;
1689 
1690 out1:
1691 	vnode_put(realdevvp);
1692 
1693 out:
1694 	nameidone(&nd);
1695 
1696 	if (error) {
1697 		vnode_put(vp);
1698 	}
1699 
1700 	return error;
1701 }
1702 
1703 /*
1704  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1705  * and call checkdirs()
1706  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	/*
	 * NOTE(review): vnode_getname() returns a name reference; this debug
	 * path does not appear to release it (vnode_putname) — confirm whether
	 * that leaks under DEBUG_IMGSRC builds.
	 */
	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Attach the mount to the covered vnode and clear the in-progress flag */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * ensure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/*
	 * NOTE(review): on error, mnt_vnodecovered is cleared but
	 * vp->v_mountedhere is left pointing at mp — confirm the caller's
	 * error path accounts for this before the vnode is reused.
	 */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1752 
1753 static void
undo_place_on_covered_vp(mount_t mp,vnode_t vp)1754 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1755 {
1756 	vnode_rele(vp);
1757 	vnode_lock_spin(vp);
1758 	vp->v_mountedhere = (mount_t)NULL;
1759 	vnode_unlock(vp);
1760 
1761 	mp->mnt_vnodecovered = NULLVP;
1762 }
1763 
/*
 * Authorize an update-style operation on `mp` and take its rwlock
 * exclusively.  On success (return 0) the caller must later release the
 * lock via mount_end_update(); on failure the lock is already dropped.
 */
static int
mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
{
	int error;

	/* unmount in progress return error */
	mount_lock_spin(mp);
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		mount_unlock(mp);
		return EBUSY;
	}
	mount_unlock(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/*
	 * We only allow the filesystem to be reloaded if it
	 * is currently mounted read-only.
	 */
	if ((flags & MNT_RELOAD) &&
	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to update it.
	 */
	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
	    (!vfs_context_issuser(ctx))) {
		error = EPERM;
		goto out;
	}
#if CONFIG_MACF
	/* MAC policy check for remount */
	error = mac_mount_check_remount(ctx, mp);
	if (error != 0) {
		goto out;
	}
#endif

out:
	/* Drop the rwlock on failure so callers need not clean up */
	if (error) {
		lck_rw_done(&mp->mnt_rwlock);
	}

	return error;
}
1811 
/* Release the mount rwlock taken by a successful mount_begin_update() */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1817 
1818 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)1819 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1820 {
1821 	vnode_t vp;
1822 
1823 	if (height >= MAX_IMAGEBOOT_NESTING) {
1824 		return EINVAL;
1825 	}
1826 
1827 	vp = imgsrc_rootvnodes[height];
1828 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1829 		*rvpp = vp;
1830 		return 0;
1831 	} else {
1832 		return ENOENT;
1833 	}
1834 }
1835 
1836 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)1837 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1838     struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1839     boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1840 {
1841 	int error;
1842 	mount_t mp;
1843 	boolean_t placed = FALSE;
1844 	struct vfstable *vfsp;
1845 	user_addr_t devpath;
1846 	char *old_mntonname;
1847 	vnode_t rvp;
1848 	vnode_t devvp;
1849 	uint32_t height;
1850 	uint32_t flags;
1851 
1852 	/* If we didn't imageboot, nothing to move */
1853 	if (imgsrc_rootvnodes[0] == NULLVP) {
1854 		return EINVAL;
1855 	}
1856 
1857 	/* Only root can do this */
1858 	if (!vfs_context_issuser(ctx)) {
1859 		return EPERM;
1860 	}
1861 
1862 	IMGSRC_DEBUG("looking for root vnode.\n");
1863 
1864 	/*
1865 	 * Get root vnode of filesystem we're moving.
1866 	 */
1867 	if (by_index) {
1868 		if (is64bit) {
1869 			struct user64_mnt_imgsrc_args mia64;
1870 			error = copyin(fsmountargs, &mia64, sizeof(mia64));
1871 			if (error != 0) {
1872 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
1873 				return error;
1874 			}
1875 
1876 			height = mia64.mi_height;
1877 			flags = mia64.mi_flags;
1878 			devpath = (user_addr_t)mia64.mi_devpath;
1879 		} else {
1880 			struct user32_mnt_imgsrc_args mia32;
1881 			error = copyin(fsmountargs, &mia32, sizeof(mia32));
1882 			if (error != 0) {
1883 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
1884 				return error;
1885 			}
1886 
1887 			height = mia32.mi_height;
1888 			flags = mia32.mi_flags;
1889 			devpath = mia32.mi_devpath;
1890 		}
1891 	} else {
1892 		/*
1893 		 * For binary compatibility--assumes one level of nesting.
1894 		 */
1895 		if (is64bit) {
1896 			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1897 				return error;
1898 			}
1899 		} else {
1900 			user32_addr_t tmp;
1901 			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1902 				return error;
1903 			}
1904 
1905 			/* munge into LP64 addr */
1906 			devpath = CAST_USER_ADDR_T(tmp);
1907 		}
1908 
1909 		height = 0;
1910 		flags = 0;
1911 	}
1912 
1913 	if (flags != 0) {
1914 		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1915 		return EINVAL;
1916 	}
1917 
1918 	error = get_imgsrc_rootvnode(height, &rvp);
1919 	if (error != 0) {
1920 		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1921 		return error;
1922 	}
1923 
1924 	IMGSRC_DEBUG("got old root vnode\n");
1925 
1926 	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
1927 
1928 	/* Can only move once */
1929 	mp = vnode_mount(rvp);
1930 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1931 		IMGSRC_DEBUG("Already moved.\n");
1932 		error = EBUSY;
1933 		goto out0;
1934 	}
1935 
1936 	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1937 	IMGSRC_DEBUG("Starting updated.\n");
1938 
1939 	/* Get exclusive rwlock on mount, authorize update on mp */
1940 	error = mount_begin_update(mp, ctx, 0);
1941 	if (error != 0) {
1942 		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1943 		goto out0;
1944 	}
1945 
1946 	/*
1947 	 * It can only be moved once.  Flag is set under the rwlock,
1948 	 * so we're now safe to proceed.
1949 	 */
1950 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1951 		IMGSRC_DEBUG("Already moved [2]\n");
1952 		goto out1;
1953 	}
1954 
1955 	IMGSRC_DEBUG("Preparing coveredvp.\n");
1956 
1957 	/* Mark covered vnode as mount in progress, authorize placing mount on top */
1958 	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
1959 	if (error != 0) {
1960 		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1961 		goto out1;
1962 	}
1963 
1964 	IMGSRC_DEBUG("Covered vp OK.\n");
1965 
1966 	/* Sanity check the name caller has provided */
1967 	vfsp = mp->mnt_vtable;
1968 	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1969 		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1970 		    vfsp->vfc_name, fsname);
1971 		error = EINVAL;
1972 		goto out2;
1973 	}
1974 
1975 	/* Check the device vnode and update mount-from name, for local filesystems */
1976 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1977 		IMGSRC_DEBUG("Local, doing device validation.\n");
1978 
1979 		if (devpath != USER_ADDR_NULL) {
1980 			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1981 			if (error) {
1982 				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1983 				goto out2;
1984 			}
1985 
1986 			vnode_put(devvp);
1987 		}
1988 	}
1989 
1990 	/*
1991 	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
1992 	 * and increment the name cache's mount generation
1993 	 */
1994 
1995 	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1996 	error = place_mount_and_checkdirs(mp, vp, ctx);
1997 	if (error != 0) {
1998 		goto out2;
1999 	}
2000 
2001 	placed = TRUE;
2002 
2003 	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2004 	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2005 
2006 	/* Forbid future moves */
2007 	mount_lock(mp);
2008 	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2009 	mount_unlock(mp);
2010 
2011 	/* Finally, add to mount list, completely ready to go */
2012 	if (mount_list_add(mp) != 0) {
2013 		/*
2014 		 * The system is shutting down trying to umount
2015 		 * everything, so fail with a plausible errno.
2016 		 */
2017 		error = EBUSY;
2018 		goto out3;
2019 	}
2020 
2021 	mount_end_update(mp);
2022 	vnode_put(rvp);
2023 	zfree(ZV_NAMEI, old_mntonname);
2024 
2025 	vfs_notify_mount(pvp);
2026 
2027 	return 0;
2028 out3:
2029 	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2030 
2031 	mount_lock(mp);
2032 	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2033 	mount_unlock(mp);
2034 
2035 out2:
2036 	/*
2037 	 * Placing the mp on the vnode clears VMOUNT,
2038 	 * so cleanup is different after that point
2039 	 */
2040 	if (placed) {
2041 		/* Rele the vp, clear VMOUNT and v_mountedhere */
2042 		undo_place_on_covered_vp(mp, vp);
2043 	} else {
2044 		vnode_lock_spin(vp);
2045 		CLR(vp->v_flag, VMOUNT);
2046 		vnode_unlock(vp);
2047 	}
2048 out1:
2049 	mount_end_update(mp);
2050 
2051 out0:
2052 	vnode_put(rvp);
2053 	zfree(ZV_NAMEI, old_mntonname);
2054 	return error;
2055 }
2056 
2057 #endif /* CONFIG_IMGSRC_ACCESS */
2058 
2059 void
enablequotas(struct mount * mp,vfs_context_t ctx)2060 enablequotas(struct mount *mp, vfs_context_t ctx)
2061 {
2062 	struct nameidata qnd;
2063 	int type;
2064 	char qfpath[MAXPATHLEN];
2065 	const char *qfname = QUOTAFILENAME;
2066 	const char *qfopsname = QUOTAOPSNAME;
2067 	const char *qfextension[] = INITQFNAMES;
2068 
2069 	/* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
2070 	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
2071 		return;
2072 	}
2073 	/*
2074 	 * Enable filesystem disk quotas if necessary.
2075 	 * We ignore errors as this should not interfere with final mount
2076 	 */
2077 	for (type = 0; type < MAXQUOTAS; type++) {
2078 		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
2079 		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
2080 		    CAST_USER_ADDR_T(qfpath), ctx);
2081 		if (namei(&qnd) != 0) {
2082 			continue;           /* option file to trigger quotas is not present */
2083 		}
2084 		vnode_put(qnd.ni_vp);
2085 		nameidone(&qnd);
2086 		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
2087 
2088 		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
2089 	}
2090 	return;
2091 }
2092 
2093 
/*
 * Per-process callback for checkdirs().
 *
 * If this process's current directory (fd_cdir) or root directory
 * (fd_rdir) is the just-covered vnode (cdrp->olddp), repoint it at the
 * root of the newly mounted filesystem (cdrp->newdp) and transfer the
 * usecounts accordingly.  Always returns PROC_RETURNED so the process
 * iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;        /* ref to give away as the new cdir */
	vnode_t new_rvp = newdp;        /* ref to give away as the new rdir */
	vnode_t old_cvp = NULL;         /* displaced cdir ref, to be released */
	vnode_t old_rvp = NULL;         /* displaced rdir ref, to be released */

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* second ref failed: give back the first one too */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;         /* this ref is now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;         /* this ref is now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed:
	 * the displaced old directory refs, plus any of the two
	 * newdp refs taken above that went unused.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2173 
2174 
2175 
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * Also switches the global rootvnode if the covered vnode was the
 * system root.  Returns 0 on success or the error from VFS_ROOT().
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount is our caller's: no cwd/rdir can reference olddp. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root vnode of the filesystem now mounted over olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was mounted over, switch rootvnode too. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2218 
2219 /*
2220  * Unmount a file system.
2221  *
2222  * Note: unmount takes a path to the vnode mounted on as argument,
2223  * not special file (as before).
2224  */
2225 /* ARGSUSED */
2226 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2227 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2228 {
2229 	vnode_t vp;
2230 	struct mount *mp;
2231 	int error;
2232 	struct nameidata nd;
2233 	vfs_context_t ctx = vfs_context_current();
2234 
2235 	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2236 	    UIO_USERSPACE, uap->path, ctx);
2237 	error = namei(&nd);
2238 	if (error) {
2239 		return error;
2240 	}
2241 	vp = nd.ni_vp;
2242 	mp = vp->v_mount;
2243 	nameidone(&nd);
2244 
2245 #if CONFIG_MACF
2246 	error = mac_mount_check_umount(ctx, mp);
2247 	if (error != 0) {
2248 		vnode_put(vp);
2249 		return error;
2250 	}
2251 #endif
2252 	/*
2253 	 * Must be the root of the filesystem
2254 	 */
2255 	if ((vp->v_flag & VROOT) == 0) {
2256 		vnode_put(vp);
2257 		return EINVAL;
2258 	}
2259 	mount_ref(mp, 0);
2260 	vnode_put(vp);
2261 	/* safedounmount consumes the mount ref */
2262 	return safedounmount(mp, uap->flags, ctx);
2263 }
2264 
2265 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2266 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2267 {
2268 	mount_t mp;
2269 
2270 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2271 	if (mp == (mount_t)0) {
2272 		return ENOENT;
2273 	}
2274 	mount_ref(mp, 0);
2275 	mount_iterdrop(mp);
2276 	/* safedounmount consumes the mount ref */
2277 	return safedounmount(mp, flags, ctx);
2278 }
2279 
/*
 * Entitlement allowing a non-root process to unmount volumes mounted by
 * other processes; checked in safedounmount() to bypass the owner check.
 */
#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
	"com.apple.private.vfs.role-account-unmount"
2282 
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Performs permission and sanity checks (ownership/entitlement, root
 * and system volumes, imageboot backing store), then hands off to
 * dounmount().  On any failure path the mount ref is dropped here.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 *
	 * NOTE(review): other MNT_L* bits in this file are tested against
	 * mnt_lflag, but MNT_LNOTRESP is tested against mnt_kern_flag here
	 * -- confirm the intended flag field.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() consumes the mount ref (withref == 1). */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2350 
/*
 * Do the actual file system unmount.
 *
 * mp       mount to tear down; a mount ref is consumed iff withref != 0.
 * flags    MNT_FORCE forces the unmount (and unmounts submounts first);
 *          MNT_NOBLOCK/MNT_LNOSUB modify blocking and recursion behavior.
 *
 * On success the mount is removed from the mount list and torn down (the
 * struct is freed either via the covered-vnode crossref path or directly
 * for MNT_ROOTFS).  On failure the unmount-in-progress state is cleared
 * and the mount remains usable.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep the calling process from hanging on unresponsive remote fs. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* Publish the unmount-in-progress state. */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * ensure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* Flush dirty data; a sync failure aborts the unmount. */
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* Reclaim all vnodes of this mount (forced: close them out too). */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure no one is in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock across list removal, then re-take it. */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: mount lock is held here on both success and failure. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Tell watchers of the parent directory about the change. */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mounts have no covered vnode: free directly. */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2638 
/*
 * Unmount any mounts in this filesystem.
 *
 * Builds the transitive closure of mounts stacked (directly or
 * indirectly) on top of mp by scanning the mount list in mount order,
 * then unmounts them in reverse (most recently mounted first).
 * Unmount errors are ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;        /* m indexes the last fsid collected */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		/* NOTE(review): relies on kfree_data() accepting NULL below */
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* stacked on a known member: record it too */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	// Index 0 is mp itself, which the caller unmounts.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
2699 
2700 void
mount_dropcrossref(mount_t mp,vnode_t dp,int need_put)2701 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2702 {
2703 	vnode_lock(dp);
2704 	mp->mnt_crossref--;
2705 
2706 	if (mp->mnt_crossref < 0) {
2707 		panic("mount cross refs -ve");
2708 	}
2709 
2710 	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2711 		if (need_put) {
2712 			vnode_put_locked(dp);
2713 		}
2714 		vnode_unlock(dp);
2715 
2716 		mount_lock_destroy(mp);
2717 #if CONFIG_MACF
2718 		mac_mount_label_destroy(mp);
2719 #endif
2720 		zfree(mount_zone, mp);
2721 		return;
2722 	}
2723 	if (need_put) {
2724 		vnode_put_locked(dp);
2725 	}
2726 	vnode_unlock(dp);
2727 }
2728 
2729 
2730 /*
2731  * Sync each mounted filesystem.
2732  */
#if DIAGNOSTIC
int syncprt = 0;        /* when set, sync paths call vfs_bufstats() (debug) */
#endif

int print_vmpage_stat = 0;      /* when set, sync paths call vm_countdirtypages() */
2738 
2739 /*
2740  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
2741  *			mounted read-write with the passed waitfor value.
2742  *
2743  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
2744  *		arg	user argument (please see below)
2745  *
2746  * User argument is a pointer to 32 bit unsigned integer which describes the
2747  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
2748  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2749  * waitfor value.
2750  *
2751  * Returns:		VFS_RETURNED
2752  */
2753 static int
sync_callback(mount_t mp,void * arg)2754 sync_callback(mount_t mp, void *arg)
2755 {
2756 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2757 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
2758 		unsigned waitfor = MNT_NOWAIT;
2759 
2760 		if (arg) {
2761 			waitfor = *(uint32_t*)arg;
2762 		}
2763 
2764 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
2765 		if (waitfor != MNT_WAIT &&
2766 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
2767 		    waitfor != MNT_NOWAIT &&
2768 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2769 		    waitfor != MNT_DWAIT &&
2770 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2771 			panic("Passed inappropriate waitfor %u to "
2772 			    "sync_callback()", waitfor);
2773 		}
2774 
2775 		mp->mnt_flag &= ~MNT_ASYNC;
2776 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2777 		if (asyncflag) {
2778 			mp->mnt_flag |= MNT_ASYNC;
2779 		}
2780 	}
2781 
2782 	return VFS_RETURNED;
2783 }
2784 
2785 /* ARGSUSED */
2786 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)2787 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2788 {
2789 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2790 
2791 	if (print_vmpage_stat) {
2792 		vm_countdirtypages();
2793 	}
2794 
2795 #if DIAGNOSTIC
2796 	if (syncprt) {
2797 		vfs_bufstats();
2798 	}
2799 #endif /* DIAGNOSTIC */
2800 	return 0;
2801 }
2802 
/*
 * Selects which class of media sync_internal_callback() will sync:
 * all mounts, only "reliable" (local, non-virtual-device) media, or
 * only "unreliable" media.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
2808 
2809 static int
sync_internal_callback(mount_t mp,void * arg)2810 sync_internal_callback(mount_t mp, void *arg)
2811 {
2812 	if (arg) {
2813 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2814 		    (mp->mnt_flag & MNT_LOCAL);
2815 		sync_type_t sync_type = *((sync_type_t *)arg);
2816 
2817 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2818 			return VFS_RETURNED;
2819 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2820 			return VFS_RETURNED;
2821 		}
2822 	}
2823 
2824 	(void)sync_callback(mp, NULL);
2825 
2826 	return VFS_RETURNED;
2827 }
2828 
int sync_thread_state = 0;      /* SYNC_THREAD_* bits, manipulated under sync_mtx_lck */
int sync_timeout_seconds = 5;   /* max time sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN       0x0001    /* work is pending for the sync thread */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
2838 
/*
 * Body of the kernel sync thread started by sync_internal().
 *
 * Loops while SYNC_THREAD_RUN is set (new requests may arrive while a
 * pass is in flight), syncing reliable (local, non-virtual) media first
 * and then unreliable media.  On exit it wakes any waiters and clears
 * SYNC_THREAD_RUNNING.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; drop the lock while doing the I/O. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2882 
/* Last time a sync timeout was logged; rate-limits the message (120s). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2884 
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout_seconds (the wait is
 * bounded by msleep's timeout) and always returns 0, even when the
 * sync thread could not be started or timed out.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	/* Spawn a sync thread only if one is not already alive. */
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP: msleep releases sync_mtx_lck whether it wakes or times out. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		/* Rate-limit the timeout warning to once every two minutes. */
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2931 
2932 /*
2933  * Change filesystem quotas.
2934  */
2935 #if QUOTA
/*
 * quotactl(2): manipulate filesystem quotas on the volume containing
 * uap->path.  The quota subcommand (upper bits of uap->cmd) selects
 * what uap->arg points to: a quota file path (Q_QUOTAON), a dqblk
 * (Q_GETQUOTA/Q_SETQUOTA/Q_SETUSE), or an int (Q_QUOTASTAT).  The data
 * is staged in/out of kernel buffers around VFS_QUOTACTL().
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold a mount ref; the vnode itself is not needed past here. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);       /* freed in the second switch */
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge to kernel form */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results back out / release staged buffers. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3042 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out: the syscall is not supported. */
	return EOPNOTSUPP;
}
3048 #endif /* QUOTA */
3049 
3050 /*
3051  * Get filesystem statistics.
3052  *
3053  * Returns:	0			Success
3054  *	namei:???
3055  *	vfs_update_vfsstat:???
3056  *	munge_statfs:EFAULT
3057  */
3058 /* ARGSUSED */
3059 int
statfs(__unused proc_t p,struct statfs_args * uap,__unused int32_t * retval)3060 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3061 {
3062 	struct mount *mp;
3063 	struct vfsstatfs *sp;
3064 	int error;
3065 	struct nameidata nd;
3066 	vfs_context_t ctx = vfs_context_current();
3067 	vnode_t vp;
3068 
3069 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3070 	    UIO_USERSPACE, uap->path, ctx);
3071 	error = namei(&nd);
3072 	if (error != 0) {
3073 		return error;
3074 	}
3075 	vp = nd.ni_vp;
3076 	mp = vp->v_mount;
3077 	sp = &mp->mnt_vfsstat;
3078 	nameidone(&nd);
3079 
3080 #if CONFIG_MACF
3081 	error = mac_mount_check_stat(ctx, mp);
3082 	if (error != 0) {
3083 		vnode_put(vp);
3084 		return error;
3085 	}
3086 #endif
3087 
3088 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3089 	if (error != 0) {
3090 		vnode_put(vp);
3091 		return error;
3092 	}
3093 
3094 	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3095 	vnode_put(vp);
3096 	return error;
3097 }
3098 
3099 /*
3100  * Get filesystem statistics.
3101  */
3102 /* ARGSUSED */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Translate the fd to its vnode; takes a file reference we must drop. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount so the vnode cannot be reclaimed while in use. */
	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* Vnode is no longer attached to a mount (e.g. after unmount). */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	sp = &mp->mnt_vfsstat;
	/* Refresh the mount's cached statistics before copying them out. */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	/* Convert to the caller's 32- or 64-bit statfs layout and copy out. */
	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3151 
3152 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3153 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3154 {
3155 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3156 
3157 	bzero(sfs, sizeof(*sfs));
3158 
3159 	sfs->f_bsize = vsfs->f_bsize;
3160 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3161 	sfs->f_blocks = vsfs->f_blocks;
3162 	sfs->f_bfree = vsfs->f_bfree;
3163 	sfs->f_bavail = vsfs->f_bavail;
3164 	sfs->f_files = vsfs->f_files;
3165 	sfs->f_ffree = vsfs->f_ffree;
3166 	sfs->f_fsid = vsfs->f_fsid;
3167 	sfs->f_owner = vsfs->f_owner;
3168 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3169 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3170 	sfs->f_fssubtype = vsfs->f_fssubtype;
3171 	sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3172 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3173 		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3174 	} else {
3175 		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3176 	}
3177 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3178 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3179 }
3180 
3181 /*
3182  * Get file system statistics in 64-bit mode
3183  */
3184 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3185 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3186 {
3187 	struct mount *mp;
3188 	int error;
3189 	struct nameidata *ndp;
3190 	struct statfs64 *sfsp;
3191 	vfs_context_t ctxp = vfs_context_current();
3192 	vnode_t vp;
3193 	struct {
3194 		struct nameidata nd;
3195 		struct statfs64 sfs;
3196 	} *__nameidata_statfs64;
3197 
3198 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3199 	    Z_WAITOK);
3200 	ndp = &__nameidata_statfs64->nd;
3201 
3202 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3203 	    UIO_USERSPACE, uap->path, ctxp);
3204 	error = namei(ndp);
3205 	if (error != 0) {
3206 		goto out;
3207 	}
3208 	vp = ndp->ni_vp;
3209 	mp = vp->v_mount;
3210 	nameidone(ndp);
3211 
3212 #if CONFIG_MACF
3213 	error = mac_mount_check_stat(ctxp, mp);
3214 	if (error != 0) {
3215 		vnode_put(vp);
3216 		goto out;
3217 	}
3218 #endif
3219 
3220 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3221 	if (error != 0) {
3222 		vnode_put(vp);
3223 		goto out;
3224 	}
3225 
3226 	sfsp = &__nameidata_statfs64->sfs;
3227 	vfs_get_statfs64(mp, sfsp);
3228 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3229 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3230 		/* This process does not want to see a seperate data volume mountpoint */
3231 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3232 	}
3233 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3234 	vnode_put(vp);
3235 
3236 out:
3237 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3238 
3239 	return error;
3240 }
3241 
3242 /*
3243  * Get file system statistics in 64-bit mode
3244  */
3245 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3246 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3247 {
3248 	struct vnode *vp;
3249 	struct mount *mp;
3250 	struct statfs64 sfs;
3251 	int error;
3252 
3253 	AUDIT_ARG(fd, uap->fd);
3254 
3255 	if ((error = file_vnode(uap->fd, &vp))) {
3256 		return error;
3257 	}
3258 
3259 	error = vnode_getwithref(vp);
3260 	if (error) {
3261 		file_drop(uap->fd);
3262 		return error;
3263 	}
3264 
3265 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3266 
3267 	mp = vp->v_mount;
3268 	if (!mp) {
3269 		error = EBADF;
3270 		goto out;
3271 	}
3272 
3273 #if CONFIG_MACF
3274 	error = mac_mount_check_stat(vfs_context_current(), mp);
3275 	if (error != 0) {
3276 		goto out;
3277 	}
3278 #endif
3279 
3280 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3281 		goto out;
3282 	}
3283 
3284 	vfs_get_statfs64(mp, &sfs);
3285 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3286 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3287 		/* This process does not want to see a seperate data volume mountpoint */
3288 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3289 	}
3290 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3291 
3292 out:
3293 	file_drop(uap->fd);
3294 	vnode_put(vp);
3295 
3296 	return error;
3297 }
3298 
/* Shared iteration state for the getfsstat family of vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t     sfsp;      /* user buffer cursor for statfs output; 0 = count only */
	user_addr_t     *mp;       /* optional array of user MAC-label pointers (or NULL) */
	int             count;     /* mounts visited so far (keeps counting when buffer fills) */
	int             maxcount;  /* capacity of the user buffer, in entries */
	int             flags;     /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT flags */
	int             error;     /* first error encountered during iteration */
};
3307 
3308 
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat().
 *
 * For each mount: optionally refreshes the cached vfsstatfs, converts it
 * to the caller's 32/64-bit statfs layout and copies it out, advancing
 * the user buffer cursor.  Always increments fstp->count so the caller
 * can report the total number of mounts even when the buffer is full.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Skip dead mounts and mounts whose stats cannot be refreshed. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by however many bytes munge_statfs actually wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3362 
3363 /*
3364  * Get statistics on all filesystems.
3365  */
3366 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3367 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3368 {
3369 	struct __mac_getfsstat_args muap;
3370 
3371 	muap.buf = uap->buf;
3372 	muap.bufsize = uap->bufsize;
3373 	muap.mac = USER_ADDR_NULL;
3374 	muap.macsize = 0;
3375 	muap.flags = uap->flags;
3376 
3377 	return __mac_getfsstat(p, &muap, retval);
3378 }
3379 
3380 /*
3381  * __mac_getfsstat: Get MAC-related file system statistics
3382  *
3383  * Parameters:    p                        (ignored)
3384  *                uap                      User argument descriptor (see below)
3385  *                retval                   Count of file system statistics (N stats)
3386  *
3387  * Indirect:      uap->bufsize             Buffer size
3388  *                uap->macsize             MAC info size
3389  *                uap->buf                 Buffer where information will be returned
3390  *                uap->mac                 MAC info
3391  *                uap->flags               File system flags
3392  *
3393  *
3394  * Returns:        0                       Success
3395  *                !0                       Not success
3396  *
3397  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Buffer capacity in entries depends on the caller's statfs layout. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The label array must have exactly one slot per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including those in the middle of unmounting. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* If the buffer filled, report its capacity; otherwise the true count. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3491 
3492 static int
getfsstat64_callback(mount_t mp,void * arg)3493 getfsstat64_callback(mount_t mp, void * arg)
3494 {
3495 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3496 	struct vfsstatfs *sp;
3497 	struct statfs64 sfs;
3498 	int error;
3499 
3500 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
3501 #if CONFIG_MACF
3502 		error = mac_mount_check_stat(vfs_context_current(), mp);
3503 		if (error != 0) {
3504 			fstp->error = error;
3505 			return VFS_RETURNED_DONE;
3506 		}
3507 #endif
3508 		sp = &mp->mnt_vfsstat;
3509 		/*
3510 		 * If MNT_NOWAIT is specified, do not refresh the fsstat
3511 		 * cache. MNT_WAIT overrides MNT_NOWAIT.
3512 		 *
3513 		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3514 		 * getfsstat, since the constants are out of the same
3515 		 * namespace.
3516 		 */
3517 		if ((mp->mnt_lflag & MNT_LDEAD) ||
3518 		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3519 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3520 		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3521 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3522 			return VFS_RETURNED;
3523 		}
3524 
3525 		vfs_get_statfs64(mp, &sfs);
3526 		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3527 		if (error) {
3528 			fstp->error = error;
3529 			return VFS_RETURNED_DONE;
3530 		}
3531 		fstp->sfsp += sizeof(sfs);
3532 	}
3533 	fstp->count++;
3534 	return VFS_RETURNED;
3535 }
3536 
3537 /*
3538  * Get statistics on all file systems in 64 bit mode.
3539  */
3540 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3541 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3542 {
3543 	user_addr_t sfsp;
3544 	int count, maxcount;
3545 	struct getfsstat_struct fst;
3546 
3547 	maxcount = uap->bufsize / sizeof(struct statfs64);
3548 
3549 	sfsp = uap->buf;
3550 	count = 0;
3551 
3552 	fst.sfsp = sfsp;
3553 	fst.flags = uap->flags;
3554 	fst.count = 0;
3555 	fst.error = 0;
3556 	fst.maxcount = maxcount;
3557 
3558 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3559 
3560 	if (fst.error) {
3561 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3562 		return fst.error;
3563 	}
3564 
3565 	if (fst.sfsp && fst.count > fst.maxcount) {
3566 		*retval = fst.maxcount;
3567 	} else {
3568 		*retval = fst.count;
3569 	}
3570 
3571 	return 0;
3572 }
3573 
3574 /*
3575  * gets the associated vnode with the file descriptor passed.
3576  * as input
3577  *
3578  * INPUT
3579  * ctx - vfs context of caller
3580  * fd - file descriptor for which vnode is required.
3581  * vpp - Pointer to pointer to vnode to be returned.
3582  *
3583  * The vnode is returned with an iocount so any vnode obtained
3584  * by this call needs a vnode_put
3585  *
3586  */
3587 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3588 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3589 {
3590 	int error;
3591 	vnode_t vp;
3592 	struct fileproc *fp;
3593 	proc_t p = vfs_context_proc(ctx);
3594 
3595 	*vpp =  NULLVP;
3596 
3597 	error = fp_getfvp(p, fd, &fp, &vp);
3598 	if (error) {
3599 		return error;
3600 	}
3601 
3602 	error = vnode_getwithref(vp);
3603 	if (error) {
3604 		(void)fp_drop(p, fd, fp, 0);
3605 		return error;
3606 	}
3607 
3608 	(void)fp_drop(p, fd, fp, 0);
3609 	*vpp = vp;
3610 	return error;
3611 }
3612 
3613 /*
3614  * Wrapper function around namei to start lookup from a directory
3615  * specified by a file descriptor ni_dirfd.
3616  *
3617  * In addition to all the errors returned by namei, this call can
3618  * return ENOTDIR if the file descriptor does not refer to a directory.
3619  * and EBADF if the file descriptor is not valid.
3620  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult dirfd for a fresh lookup that has not already
	 * supplied its own starting directory vnode (USEDVP) and is not
	 * a continuation of a prior lookup.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to test for '/'. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Relative path: anchor the lookup at dirfd's vnode. */
		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* USEDVP tells namei to start at ni_dvp; clear it after. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain namei semantics. */
	return namei(ndp);
}
3664 
3665 /*
3666  * Change current working directory to a given file descriptor.
3667  */
3668 /* ARGSUSED */
/*
 * Shared implementation for fchdir() and __pthread_fchdir().
 *
 * per_thread == 0 changes the process-wide current directory;
 * per_thread != 0 changes (or, with fd == -1, clears) the calling
 * thread's private current directory.
 */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend to the root of the
	 * mounted filesystem (repeating for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount on the new cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as having per-thread cwds somewhere. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap in the new cwd under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Drop the usecount held on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
3781 
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir to the directory open on uap->fd. */
	return common_fchdir(p, uap, 0);
}
3787 
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread chdir; the args struct is treated as layout-compatible
	 * with struct fchdir_args (hence the cast).
	 */
	return common_fchdir(p, (void *)uap, 1);
}
3793 
3794 
3795 /*
3796  * Change current working directory (".").
3797  *
3798  * Returns:	0			Success
3799  *	change_dir:ENOTDIR
3800  *	change_dir:???
3801  *	vnode_ref:ENOENT		No such file or directory
3802  */
3803 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Lookup + VDIR/MACF/search-permission checks; returns with an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-lived usecount before storing the vnode as the cwd. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having per-thread cwds somewhere. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap in the new cwd under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Drop the usecount held on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3849 
3850 
3851 /*
3852  * Change current working directory (".").
3853  *
3854  * Returns:	0			Success
3855  *	chdir_internal:ENOTDIR
3856  *	chdir_internal:ENOENT		No such file or directory
3857  *	chdir_internal:???
3858  */
3859 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Look up uap->path (following symlinks), then switch the cwd. */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
3871 
3872 
3873 /*
3874  * chdir
3875  *
3876  * Change current working directory (".") for the entire process
3877  *
3878  * Parameters:  p       Process requesting the call
3879  *              uap     User argument descriptor (see below)
3880  *              retval  (ignored)
3881  *
3882  * Indirect parameters:	uap->path	Directory path
3883  *
3884  * Returns:	0			Success
3885  *              common_chdir: ENOTDIR
3886  *              common_chdir: ENOENT	No such file or directory
3887  *              common_chdir: ???
3888  *
3889  */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir; chdir_args layout matches, hence the cast. */
	return common_chdir(p, (void *)uap, 0);
}
3895 
3896 /*
3897  * __pthread_chdir
3898  *
3899  * Change current working directory (".") for a single thread
3900  *
3901  * Parameters:  p       Process requesting the call
3902  *              uap     User argument descriptor (see below)
3903  *              retval  (ignored)
3904  *
3905  * Indirect parameters:	uap->path	Directory path
3906  *
3907  * Returns:	0			Success
3908  *              common_chdir: ENOTDIR
3909  *		common_chdir: ENOENT	No such file or directory
3910  *		common_chdir: ???
3911  *
3912  */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir; chdir_args layout matches, hence the cast. */
	return common_chdir(p, (void *)uap, 1);
}
3918 
3919 
3920 /*
3921  * Change notion of root (``/'') directory.
3922  */
3923 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir validates VDIR, MACF chdir and search permission. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount from change_dir for a long-lived usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount held on the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3981 
3982 #define PATHSTATICBUFLEN 256
3983 #define PIVOT_ROOT_ENTITLEMENT              \
3984        "com.apple.private.vfs.pivot-root"
3985 
3986 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Stack buffers for the common case; fall back to heap for long paths. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new root's path, retrying with a MAXPATHLEN heap buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Likewise for the path where the old root will appear afterwards. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point at whichever buffer (stack or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4078 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS targets. */
	return nosys(p, NULL, retval);
}
4084 #endif /* XNU_TARGET_OS_OSX */
4085 
4086 /*
4087  * Common routine for chroot and chdir.
4088  *
4089  * Returns:	0			Success
4090  *		ENOTDIR			Not a directory
4091  *		namei:???		[anything namei can return]
4092  *		vnode_authorize:???	[anything vnode_authorize can return]
4093  */
4094 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4095 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4096 {
4097 	vnode_t vp;
4098 	int error;
4099 
4100 	if ((error = namei(ndp))) {
4101 		return error;
4102 	}
4103 	nameidone(ndp);
4104 	vp = ndp->ni_vp;
4105 
4106 	if (vp->v_type != VDIR) {
4107 		vnode_put(vp);
4108 		return ENOTDIR;
4109 	}
4110 
4111 #if CONFIG_MACF
4112 	error = mac_vnode_check_chdir(ctx, vp);
4113 	if (error) {
4114 		vnode_put(vp);
4115 		return error;
4116 	}
4117 #endif
4118 
4119 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4120 	if (error) {
4121 		vnode_put(vp);
4122 		return error;
4123 	}
4124 
4125 	return error;
4126 }
4127 
4128 /*
4129  * Free the vnode data (for directories) associated with the file glob.
4130  */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
	/* Initialize the mutex guarding the directory read buffer. */
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4141 
4142 /*
4143  * Free the vnode data (for directories) associated with the file glob.
4144  */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Release the directory read buffer, the lock, then the data itself. */
	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
	kfree_type(struct fd_vn_data, fvdata);
}
4154 
4155 /*
4156  * Check permissions, allocate an open file structure,
4157  * and call the device open routine if any.
4158  *
4159  * Returns:	0			Success
4160  *		EINVAL
4161  *		EINTR
4162  *	falloc:ENFILE
4163  *	falloc:EMFILE
4164  *	falloc:ENOMEM
4165  *	vn_open_auth:???
4166  *	dupfdopen:???
4167  *	VNOP_ADVLOCK:???
4168  *	vnode_setsize:???
4169  *
4170  * XXX Need to implement uid, gid
4171  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct vfs_context context;

	oflags = uflags;

	/* all access-mode bits set at once is never a valid combination */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/*
	 * Callers may not request the encryption-state flags directly;
	 * NOTE(review): vn_open_auth() appears to be able to set them back
	 * (they are folded into fg_flag below) — confirm.
	 */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* reserve a file descriptor slot and a fileproc up front */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		/*
		 * If fdesc_open ran (uu_dupfd >= 0), the open is satisfied by
		 * duplicating an existing descriptor instead of a new vnode.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* wire the opened vnode into the fileglob */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

	/* O_EXLOCK/O_SHLOCK: take an advisory flock-style lock at open time */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* remember we hold the lock so 'bad:' (and close) can drop it */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	/*
	 * NOTE(review): vp's iocount is dropped here, yet vp is still read
	 * below (vnode_istty, secluded-memory checks).  This relies on the
	 * fileglob's reference keeping vp valid — confirm.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* decide whether this file's cached pages may use the secluded pool */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/* publish the fd to the process and return it */
	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/*
	 * Tear down a partially-completed open: drop any advisory lock taken
	 * above, close the vnode with the credentials stored in the fileglob,
	 * and release both the iocount and the reserved descriptor.
	 */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4380 
4381 /*
4382  * While most of the *at syscall handlers can call nameiat() which
4383  * is a wrapper around namei, the use of namei and initialisation
4384  * of nameidata are far removed and in different functions  - namei
4385  * gets called in vn_open_auth for open1. So we'll just do here what
4386  * nameiat() does.
4387  */
4388 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd)4389 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4390     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4391     int dirfd)
4392 {
4393 	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4394 		int error;
4395 		char c;
4396 
4397 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4398 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
4399 			if (error) {
4400 				return error;
4401 			}
4402 		} else {
4403 			c = *((char *)(ndp->ni_dirp));
4404 		}
4405 
4406 		if (c != '/') {
4407 			vnode_t dvp_at;
4408 
4409 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4410 			    &dvp_at);
4411 			if (error) {
4412 				return error;
4413 			}
4414 
4415 			if (vnode_vtype(dvp_at) != VDIR) {
4416 				vnode_put(dvp_at);
4417 				return ENOTDIR;
4418 			}
4419 
4420 			ndp->ni_dvp = dvp_at;
4421 			ndp->ni_cnd.cn_flags |= USEDVP;
4422 			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4423 			    retval);
4424 			vnode_put(dvp_at);
4425 			return error;
4426 		}
4427 	}
4428 
4429 	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval);
4430 }
4431 
4432 /*
4433  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4434  *
4435  * Parameters:	p			Process requesting the open
4436  *		uap			User argument descriptor (see below)
4437  *		retval			Pointer to an area to receive the
4438  *					return calue from the system call
4439  *
4440  * Indirect:	uap->path		Path to open (same as 'open')
4441  *		uap->flags		Flags to open (same as 'open'
4442  *		uap->uid		UID to set, if creating
4443  *		uap->gid		GID to set, if creating
4444  *		uap->mode		File mode, if creating (same as 'open')
4445  *		uap->xsecurity		ACL to set, if creating
4446  *
4447  * Returns:	0			Success
4448  *		!0			errno value
4449  *
4450  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4451  *
4452  * XXX:		We should enummerate the possible errno values here, and where
4453  *		in the code they originated.
4454  */
4455 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4456 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4457 {
4458 	int ciferror;
4459 	kauth_filesec_t xsecdst;
4460 	struct vnode_attr va;
4461 	struct nameidata nd;
4462 	int cmode;
4463 
4464 	AUDIT_ARG(owner, uap->uid, uap->gid);
4465 
4466 	xsecdst = NULL;
4467 	if ((uap->xsecurity != USER_ADDR_NULL) &&
4468 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4469 		return ciferror;
4470 	}
4471 
4472 	VATTR_INIT(&va);
4473 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4474 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4475 	if (uap->uid != KAUTH_UID_NONE) {
4476 		VATTR_SET(&va, va_uid, uap->uid);
4477 	}
4478 	if (uap->gid != KAUTH_GID_NONE) {
4479 		VATTR_SET(&va, va_gid, uap->gid);
4480 	}
4481 	if (xsecdst != NULL) {
4482 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4483 		va.va_vaflags |= VA_FILESEC_ACL;
4484 	}
4485 
4486 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4487 	    uap->path, vfs_context_current());
4488 
4489 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4490 	    NULL, NULL, retval);
4491 	if (xsecdst != NULL) {
4492 		kauth_filesec_free(xsecdst);
4493 	}
4494 
4495 	return ciferror;
4496 }
4497 
4498 /*
4499  * Go through the data-protected atomically controlled open (2)
4500  *
4501  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4502  */
4503 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4504 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4505 {
4506 	int flags = uap->flags;
4507 	int class = uap->class;
4508 	int dpflags = uap->dpflags;
4509 
4510 	/*
4511 	 * Follow the same path as normal open(2)
4512 	 * Look up the item if it exists, and acquire the vnode.
4513 	 */
4514 	struct vnode_attr va;
4515 	struct nameidata nd;
4516 	int cmode;
4517 	int error;
4518 
4519 	VATTR_INIT(&va);
4520 	/* Mask off all but regular access permissions */
4521 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4522 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4523 
4524 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4525 	    uap->path, vfs_context_current());
4526 
4527 	/*
4528 	 * Initialize the extra fields in vnode_attr to pass down our
4529 	 * extra fields.
4530 	 * 1. target cprotect class.
4531 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4532 	 */
4533 	if (flags & O_CREAT) {
4534 		/* lower level kernel code validates that the class is valid before applying it. */
4535 		if (class != PROTECTION_CLASS_DEFAULT) {
4536 			/*
4537 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4538 			 * file behave the same as open (2)
4539 			 */
4540 			VATTR_SET(&va, va_dataprotect_class, class);
4541 		}
4542 	}
4543 
4544 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4545 		if (flags & (O_RDWR | O_WRONLY)) {
4546 			/* Not allowed to write raw encrypted bytes */
4547 			return EINVAL;
4548 		}
4549 		if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4550 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4551 		}
4552 		if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4553 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4554 		}
4555 	}
4556 
4557 	error = open1(vfs_context_current(), &nd, uap->flags, &va,
4558 	    NULL, NULL, retval);
4559 
4560 	return error;
4561 }
4562 
4563 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)4564 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4565     int fd, enum uio_seg segflg, int *retval)
4566 {
4567 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4568 	struct {
4569 		struct vnode_attr va;
4570 		struct nameidata nd;
4571 	} *__open_data;
4572 	struct vnode_attr *vap;
4573 	struct nameidata *ndp;
4574 	int cmode;
4575 	int error;
4576 
4577 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
4578 	vap = &__open_data->va;
4579 	ndp = &__open_data->nd;
4580 
4581 	VATTR_INIT(vap);
4582 	/* Mask off all but regular access permissions */
4583 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4584 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
4585 
4586 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4587 	    segflg, path, ctx);
4588 
4589 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd);
4590 
4591 	kfree_type(typeof(*__open_data), __open_data);
4592 
4593 	return error;
4594 }
4595 
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a cancellation point; the real work is in open_nocancel(). */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
4602 
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Relative lookups start at the CWD (AT_FDCWD); path is a user pointer. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
4610 
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* Like open_nocancel(), but relative lookups start at uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, uap->fd, UIO_USERSPACE, retval);
}
4618 
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a cancellation point; the real work is in openat_nocancel(). */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
4625 
4626 /*
4627  * openbyid_np: open a file given a file system id and a file system object id
4628  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
4629  *	file systems that don't support object ids it is a node id (uint64_t).
4630  *
4631  * Parameters:	p			Process requesting the open
4632  *		uap			User argument descriptor (see below)
4633  *		retval			Pointer to an area to receive the
4634  *					return calue from the system call
4635  *
4636  * Indirect:	uap->path		Path to open (same as 'open')
4637  *
4638  *		uap->fsid		id of target file system
4639  *		uap->objid		id of target file system object
4640  *		uap->flags		Flags to open (same as 'open')
4641  *
4642  * Returns:	0			Success
4643  *		!0			errno value
4644  *
4645  *
4646  * XXX:		We should enummerate the possible errno values here, and where
4647  *		in the code they originated.
4648  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* open-by-id is a privileged operation */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return error;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/*
	 * Grow the buffer in MAXPATHLEN increments while the resolved path
	 * does not fit.  NOTE(review): there is no explicit upper bound on
	 * buflen here — confirm fsgetpath_internal bounds the retries.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate, then open the resolved path from kernel space */
	buf[pathlen] = 0;

	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
4705 
4706 
4707 /*
4708  * Create a special file.
4709  */
4710 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4711 
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	/* apply the process umask; va_rdev carries the device number */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, (mode_t)uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* device nodes may only be created by the superuser */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* the path must not already name an existing file */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* only character and block specials remain; FIFOs were handled above */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4814 
4815 /*
4816  * Create a named pipe.
4817  *
4818  * Returns:	0			Success
4819  *		EEXIST
4820  *	namei:???
4821  *	vnode_authorize:???
4822  *	vn_create:???
4823  */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* upath is always treated as a user-space pointer (UIO_USERSPACE) */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4866 
4867 
4868 /*
4869  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4870  *
4871  * Parameters:	p			Process requesting the open
4872  *		uap			User argument descriptor (see below)
4873  *		retval			(Ignored)
4874  *
4875  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
4876  *		uap->uid		UID to set
4877  *		uap->gid		GID to set
4878  *		uap->mode		File mode to set (same as 'mkfifo')
4879  *		uap->xsecurity		ACL to set, if creating
4880  *
4881  * Returns:	0			Success
4882  *		!0			errno value
4883  *
4884  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4885  *
4886  * XXX:		We should enummerate the possible errno values here, and where
4887  *		in the code they originated.
4888  */
4889 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)4890 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4891 {
4892 	int ciferror;
4893 	kauth_filesec_t xsecdst;
4894 	struct vnode_attr va;
4895 
4896 	AUDIT_ARG(owner, uap->uid, uap->gid);
4897 
4898 	xsecdst = KAUTH_FILESEC_NONE;
4899 	if (uap->xsecurity != USER_ADDR_NULL) {
4900 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4901 			return ciferror;
4902 		}
4903 	}
4904 
4905 	VATTR_INIT(&va);
4906 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4907 	if (uap->uid != KAUTH_UID_NONE) {
4908 		VATTR_SET(&va, va_uid, uap->uid);
4909 	}
4910 	if (uap->gid != KAUTH_GID_NONE) {
4911 		VATTR_SET(&va, va_gid, uap->gid);
4912 	}
4913 	if (xsecdst != KAUTH_FILESEC_NONE) {
4914 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4915 		va.va_vaflags |= VA_FILESEC_ACL;
4916 	}
4917 
4918 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4919 
4920 	if (xsecdst != KAUTH_FILESEC_NONE) {
4921 		kauth_filesec_free(xsecdst);
4922 	}
4923 	return ciferror;
4924 }
4925 
4926 /* ARGSUSED */
4927 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)4928 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4929 {
4930 	struct vnode_attr va;
4931 
4932 	VATTR_INIT(&va);
4933 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4934 
4935 	return mkfifo1(vfs_context_current(), uap->path, &va);
4936 }
4937 
4938 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4939 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4940 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4941 
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* Resolve the directory vnode to a path, honoring firmlinks or not. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* replace the terminating NUL with '/' and append the leaf */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* path resolved but left no room for a leaf: report truncation */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the ancestry until some prefix fits, falling back
		 * to the mount point and finally to "/".
		 * NOTE(review): in the mount-point fallback, 'len' is not
		 * updated to the copied length before the break — confirm
		 * callers tolerate the stale length.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5009 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* firmlink-following variant; see safe_getpath_new() */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
5015 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* non-firmlink variant; see safe_getpath_new() */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
5021 
5022 /*
5023  * Make a hard file link.
5024  *
5025  * Returns:	0			Success
5026  *		EPERM
5027  *		EEXIST
5028  *		EXDEV
5029  *	namei:???
5030  *	vnode_authorize:???
5031  *	VNOP_LINK:???
5032  */
5033 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
	/* the nameidata is reused for the second (CREATE) lookup on fd2 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target note */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* post-link notifications: fsevents, kauth fileop listeners, audit */
#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* drop the iocounts obtained from the two nameiat() lookups */
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5247 
5248 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5249 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5250 {
5251 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5252 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5253 }
5254 
5255 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5256 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5257 {
5258 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5259 		return EINVAL;
5260 	}
5261 
5262 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5263 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5264 }
5265 
5266 /*
5267  * Make a symbolic link.
5268  *
5269  * We could add support for ACLs here too...
5270  */
5271 /* ARGSUSED */
/*
 * Common implementation for symlink(2)/symlinkat(2).
 *
 * path_data: the symlink target string (segflg determines user/kernel).
 * fd:        directory fd that 'link' is resolved relative to (AT_FDCWD ok).
 * link:      the pathname of the symlink to create.
 *
 * Returns 0 on success, or an errno value.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	/*
	 * Copy the target string into a kernel buffer for user-space
	 * callers; kernel-resident callers pass a usable buffer directly.
	 */
	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* Look up the name of the link itself; keep the parent (LOCKPARENT). */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* New symlink mode is ACCESSPERMS filtered through the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}
	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/*
			 * VNOP_SYMLINK did not hand back the vnode; redrive
			 * the lookup so identity/fsevent handling below has it.
			 */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Only free the pathname buffer if we allocated it above. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5430 
5431 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5432 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5433 {
5434 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5435 	           uap->link, UIO_USERSPACE);
5436 }
5437 
5438 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5439 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5440     __unused int32_t *retval)
5441 {
5442 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5443 	           uap->path2, UIO_USERSPACE);
5444 }
5445 
5446 /*
5447  * Delete a whiteout from the filesystem.
5448  * No longer supported.
5449  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout deletion is no longer supported; syscall kept as a stub. */
	return ENOTSUP;
}
5455 
5456 /*
5457  * Delete a name from the filesystem.
5458  */
5459 /* ARGSUSED */
/*
 * Common implementation for unlink(2)/unlinkat(2)/delete(2) and the
 * kernel-internal unlink1().
 *
 * fd:           directory fd the path is resolved relative to (ignored
 *               when start_dvp is supplied).
 * start_dvp:    optional starting directory vnode; trumps fd when non-NULL.
 * unlink_flags: VNODE_REMOVE_* modifiers (busy-file semantics, audit
 *               suppression, namespace-event suppression).
 *
 * Returns 0 on success, or an errno value.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch: nameidata (and fsevent state) are too
	 * large for the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state must be reset: a redrive re-runs the lookup. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* ENOENT here may be a racing hardlink lookup;
				 * redrive a bounded number of times. */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp: filesystem performs lookup+remove as one VNOP. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Build both path flavors once; buffers are reused on redrive. */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
5733 
5734 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5735 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5736     enum uio_seg segflg, int unlink_flags)
5737 {
5738 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5739 	           unlink_flags);
5740 }
5741 
5742 /*
5743  * Delete a name from the filesystem using Carbon semantics.
5744  */
5745 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)5746 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5747 {
5748 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5749 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5750 }
5751 
5752 /*
5753  * Delete a name from the filesystem using POSIX semantics.
5754  */
5755 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)5756 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5757 {
5758 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5759 	           uap->path, UIO_USERSPACE, 0);
5760 }
5761 
5762 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)5763 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5764 {
5765 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5766 		return EINVAL;
5767 	}
5768 
5769 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5770 		int unlink_flags = 0;
5771 
5772 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
5773 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5774 		}
5775 		return rmdirat_internal(vfs_context_current(), uap->fd,
5776 		           uap->path, UIO_USERSPACE, unlink_flags);
5777 	} else {
5778 		return unlinkat_internal(vfs_context_current(), uap->fd,
5779 		           NULLVP, uap->path, UIO_USERSPACE, 0);
5780 	}
5781 }
5782 
5783 /*
5784  * Reposition read/write file offset.
5785  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* Non-vnode fds (sockets, pipes) report ESPIPE, per POSIX. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the candidate new offset per the whence mode. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Filesystem resolves the next hole/data region in place. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
5877 
5878 
5879 /*
5880  * Check access permissions.
5881  *
5882  * Returns:	0			Success
5883  *		vnode_authorize:???
5884  */
5885 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)5886 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5887 {
5888 	kauth_action_t action;
5889 	int error;
5890 
5891 	/*
5892 	 * If just the regular access bits, convert them to something
5893 	 * that vnode_authorize will understand.
5894 	 */
5895 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5896 		action = 0;
5897 		if (uflags & R_OK) {
5898 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
5899 		}
5900 		if (uflags & W_OK) {
5901 			if (vnode_isdir(vp)) {
5902 				action |= KAUTH_VNODE_ADD_FILE |
5903 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
5904 				/* might want delete rights here too */
5905 			} else {
5906 				action |= KAUTH_VNODE_WRITE_DATA;
5907 			}
5908 		}
5909 		if (uflags & X_OK) {
5910 			if (vnode_isdir(vp)) {
5911 				action |= KAUTH_VNODE_SEARCH;
5912 			} else {
5913 				action |= KAUTH_VNODE_EXECUTE;
5914 			}
5915 		}
5916 	} else {
5917 		/* take advantage of definition of uflags */
5918 		action = uflags >> 8;
5919 	}
5920 
5921 #if CONFIG_MACF
5922 	error = mac_vnode_check_access(ctx, vp, uflags);
5923 	if (error) {
5924 		return error;
5925 	}
5926 #endif /* MAC */
5927 
5928 	/* action == 0 means only check for existence */
5929 	if (action != 0) {
5930 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5931 	} else {
5932 		error = 0;
5933 	}
5934 
5935 	return error;
5936 }
5937 
5938 
5939 
5940 /*
5941  * access_extended: Check access permissions in bulk.
5942  *
5943  * Description:	uap->entries		Pointer to an array of accessx
5944  *                                      descriptor structs, plus one or
5945  *                                      more NULL terminated strings (see
5946  *                                      "Notes" section below).
5947  *		uap->size		Size of the area pointed to by
5948  *					uap->entries.
5949  *		uap->results		Pointer to the results array.
5950  *
5951  * Returns:	0			Success
5952  *		ENOMEM			Insufficient memory
5953  *		EINVAL			Invalid arguments
5954  *		namei:EFAULT		Bad address
5955  *		namei:ENAMETOOLONG	Filename too long
5956  *		namei:ENOENT		No such file or directory
5957  *		namei:ELOOP		Too many levels of symbolic links
5958  *		namei:EBADF		Bad file descriptor
5959  *		namei:ENOTDIR		Not a directory
5960  *		namei:???
5961  *		access1:
5962  *
5963  * Implicit returns:
5964  *		uap->results		Array contents modified
5965  *
5966  * Notes:	The uap->entries are structured as an arbitrary length array
5967  *		of accessx descriptors, followed by one or more NULL terminated
5968  *		strings
5969  *
5970  *			struct accessx_descriptor[0]
5971  *			...
5972  *			struct accessx_descriptor[n]
5973  *			char name_data[0];
5974  *
5975  *		We determine the entry count by walking the buffer containing
5976  *		the uap->entries argument descriptor.  For each descriptor we
5977  *		see, the valid values for the offset ad_name_offset will be
5978  *		in the byte range:
5979  *
5980  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
5981  *						to
5982  *				[ uap->entries + uap->size - 2 ]
5983  *
5984  *		since we must have at least one string, and the string must
5985  *		be at least one character plus the NULL terminator in length.
5986  *
5987  * XXX:		Need to support the check-as uid argument
5988  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* Initialized so the cleanup path can unconditionally test it. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; large ones are heap-backed. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  "Expected" failures are recorded
		 * per-entry; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6230 
6231 
6232 /*
6233  * Returns:	0			Success
6234  *		namei:EFAULT		Bad address
6235  *		namei:ENAMETOOLONG	Filename too long
6236  *		namei:ENOENT		No such file or directory
6237  *		namei:ELOOP		Too many levels of symbolic links
6238  *		namei:EBADF		Bad file descriptor
6239  *		namei:ENOTDIR		Not a directory
6240  *		namei:???
6241  *		access1:
6242  */
/*
 * Common implementation for access(2)/faccessat(2).
 *
 * amode: R_OK/W_OK/X_OK bits (or F_OK / extended bits) to check.
 * flag:  AT_EACCESS / AT_SYMLINK_NOFOLLOW / AT_SYMLINK_NOFOLLOW_ANY.
 *
 * Returns 0 if access would be granted, or an errno value.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks in any path component. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* Parent was only acquired when WANTPARENT was set above. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* Drop the copied real credential; the AT_EACCESS path borrowed
	 * the caller's cred and must not unref it. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6324 
6325 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6326 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6327 {
6328 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6329 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6330 }
6331 
6332 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6333 faccessat(__unused proc_t p, struct faccessat_args *uap,
6334     __unused int32_t *retval)
6335 {
6336 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6337 		return EINVAL;
6338 	}
6339 
6340 	return faccessat_internal(vfs_context_current(), uap->fd,
6341 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6342 }
6343 
6344 /*
6345  * Returns:	0			Success
6346  *		EFAULT
6347  *	copyout:EFAULT
6348  *	namei:???
6349  *	vn_stat:???
6350  */
6351 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6352 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6353     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6354     enum uio_seg segflg, int fd, int flag)
6355 {
6356 	struct nameidata nd;
6357 	int follow;
6358 	union {
6359 		struct stat sb;
6360 		struct stat64 sb64;
6361 	} source = {};
6362 	union {
6363 		struct user64_stat user64_sb;
6364 		struct user32_stat user32_sb;
6365 		struct user64_stat64 user64_sb64;
6366 		struct user32_stat64 user32_sb64;
6367 	} dest = {};
6368 	caddr_t sbp;
6369 	int error, my_size;
6370 	kauth_filesec_t fsec;
6371 	size_t xsecurity_bufsize;
6372 	void * statptr;
6373 	struct fileproc *fp = NULL;
6374 	int needsrealdev = 0;
6375 
6376 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6377 	NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6378 	    segflg, path, ctx);
6379 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6380 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6381 	}
6382 
6383 #if NAMEDRSRCFORK
6384 	int is_namedstream = 0;
6385 	/* stat calls are allowed for resource forks. */
6386 	nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6387 #endif
6388 
6389 	if (flag & AT_FDONLY) {
6390 		vnode_t fvp;
6391 
6392 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6393 		if (error) {
6394 			return error;
6395 		}
6396 		if ((error = vnode_getwithref(fvp))) {
6397 			file_drop(fd);
6398 			return error;
6399 		}
6400 		nd.ni_vp = fvp;
6401 	} else {
6402 		error = nameiat(&nd, fd);
6403 		if (error) {
6404 			return error;
6405 		}
6406 	}
6407 	fsec = KAUTH_FILESEC_NONE;
6408 
6409 	statptr = (void *)&source;
6410 
6411 #if NAMEDRSRCFORK
6412 	/* Grab reference on the shadow stream file vnode to
6413 	 * force an inactive on release which will mark it
6414 	 * for recycle.
6415 	 */
6416 	if (vnode_isnamedstream(nd.ni_vp) &&
6417 	    (nd.ni_vp->v_parent != NULLVP) &&
6418 	    vnode_isshadow(nd.ni_vp)) {
6419 		is_namedstream = 1;
6420 		vnode_ref(nd.ni_vp);
6421 	}
6422 #endif
6423 
6424 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6425 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6426 		/*
6427 		 * If the caller has the file open, and is not
6428 		 * requesting extended security information, we are
6429 		 * going to let them get the basic stat information.
6430 		 */
6431 		error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6432 		    fp->fp_glob->fg_cred);
6433 	} else {
6434 		error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6435 		    isstat64, needsrealdev, ctx);
6436 	}
6437 
6438 #if NAMEDRSRCFORK
6439 	if (is_namedstream) {
6440 		vnode_rele(nd.ni_vp);
6441 	}
6442 #endif
6443 	vnode_put(nd.ni_vp);
6444 	nameidone(&nd);
6445 	if (fp) {
6446 		file_drop(fd);
6447 		fp = NULL;
6448 	}
6449 
6450 	if (error) {
6451 		return error;
6452 	}
6453 	/* Zap spare fields */
6454 	if (isstat64 != 0) {
6455 		source.sb64.st_lspare = 0;
6456 		source.sb64.st_qspare[0] = 0LL;
6457 		source.sb64.st_qspare[1] = 0LL;
6458 		if (vfs_context_is64bit(ctx)) {
6459 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6460 			my_size = sizeof(dest.user64_sb64);
6461 			sbp = (caddr_t)&dest.user64_sb64;
6462 		} else {
6463 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6464 			my_size = sizeof(dest.user32_sb64);
6465 			sbp = (caddr_t)&dest.user32_sb64;
6466 		}
6467 		/*
6468 		 * Check if we raced (post lookup) against the last unlink of a file.
6469 		 */
6470 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6471 			source.sb64.st_nlink = 1;
6472 		}
6473 	} else {
6474 		source.sb.st_lspare = 0;
6475 		source.sb.st_qspare[0] = 0LL;
6476 		source.sb.st_qspare[1] = 0LL;
6477 		if (vfs_context_is64bit(ctx)) {
6478 			munge_user64_stat(&source.sb, &dest.user64_sb);
6479 			my_size = sizeof(dest.user64_sb);
6480 			sbp = (caddr_t)&dest.user64_sb;
6481 		} else {
6482 			munge_user32_stat(&source.sb, &dest.user32_sb);
6483 			my_size = sizeof(dest.user32_sb);
6484 			sbp = (caddr_t)&dest.user32_sb;
6485 		}
6486 
6487 		/*
6488 		 * Check if we raced (post lookup) against the last unlink of a file.
6489 		 */
6490 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6491 			source.sb.st_nlink = 1;
6492 		}
6493 	}
6494 	if ((error = copyout(sbp, ub, my_size)) != 0) {
6495 		goto out;
6496 	}
6497 
6498 	/* caller wants extended security information? */
6499 	if (xsecurity != USER_ADDR_NULL) {
6500 		/* did we get any? */
6501 		if (fsec == KAUTH_FILESEC_NONE) {
6502 			if (susize(xsecurity_size, 0) != 0) {
6503 				error = EFAULT;
6504 				goto out;
6505 			}
6506 		} else {
6507 			/* find the user buffer size */
6508 			xsecurity_bufsize = fusize(xsecurity_size);
6509 
6510 			/* copy out the actual data size */
6511 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6512 				error = EFAULT;
6513 				goto out;
6514 			}
6515 
6516 			/* if the caller supplied enough room, copy out to it */
6517 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6518 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6519 			}
6520 		}
6521 	}
6522 out:
6523 	if (fsec != KAUTH_FILESEC_NONE) {
6524 		kauth_filesec_free(fsec);
6525 	}
6526 	return error;
6527 }
6528 
6529 /*
6530  * stat_extended: Get file status; with extended security (ACL).
6531  *
6532  * Parameters:    p                       (ignored)
6533  *                uap                     User argument descriptor (see below)
6534  *                retval                  (ignored)
6535  *
6536  * Indirect:      uap->path               Path of file to get status from
6537  *                uap->ub                 User buffer (holds file status info)
6538  *                uap->xsecurity          ACL to get (extended security)
6539  *                uap->xsecurity_size     Size of ACL
6540  *
6541  * Returns:        0                      Success
6542  *                !0                      errno value
6543  *
6544  */
6545 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)6546 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6547     __unused int32_t *retval)
6548 {
6549 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6550 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6551 	           0);
6552 }
6553 
6554 /*
6555  * Returns:	0			Success
6556  *	fstatat_internal:???		[see fstatat_internal() in this file]
6557  */
6558 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)6559 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6560 {
6561 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6562 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6563 }
6564 
6565 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)6566 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6567 {
6568 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6569 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6570 }
6571 
6572 /*
6573  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6574  *
6575  * Parameters:    p                       (ignored)
6576  *                uap                     User argument descriptor (see below)
6577  *                retval                  (ignored)
6578  *
6579  * Indirect:      uap->path               Path of file to get status from
6580  *                uap->ub                 User buffer (holds file status info)
6581  *                uap->xsecurity          ACL to get (extended security)
6582  *                uap->xsecurity_size     Size of ACL
6583  *
6584  * Returns:        0                      Success
6585  *                !0                      errno value
6586  *
6587  */
6588 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)6589 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6590 {
6591 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6592 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6593 	           0);
6594 }
6595 
6596 /*
6597  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6598  *
6599  * Parameters:    p                       (ignored)
6600  *                uap                     User argument descriptor (see below)
6601  *                retval                  (ignored)
6602  *
6603  * Indirect:      uap->path               Path of file to get status from
6604  *                uap->ub                 User buffer (holds file status info)
6605  *                uap->xsecurity          ACL to get (extended security)
6606  *                uap->xsecurity_size     Size of ACL
6607  *
6608  * Returns:        0                      Success
6609  *                !0                      errno value
6610  *
6611  */
6612 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)6613 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6614 {
6615 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6616 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6617 	           AT_SYMLINK_NOFOLLOW);
6618 }
6619 
6620 /*
6621  * Get file status; this version does not follow links.
6622  */
6623 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)6624 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6625 {
6626 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6627 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6628 }
6629 
6630 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)6631 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6632 {
6633 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6634 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6635 }
6636 
6637 /*
6638  * lstat64_extended: Get file status; can handle large inode numbers; does not
6639  * follow links; with extended security (ACL).
6640  *
6641  * Parameters:    p                       (ignored)
6642  *                uap                     User argument descriptor (see below)
6643  *                retval                  (ignored)
6644  *
6645  * Indirect:      uap->path               Path of file to get status from
6646  *                uap->ub                 User buffer (holds file status info)
6647  *                uap->xsecurity          ACL to get (extended security)
6648  *                uap->xsecurity_size     Size of ACL
6649  *
6650  * Returns:        0                      Success
6651  *                !0                      errno value
6652  *
6653  */
6654 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)6655 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6656 {
6657 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6658 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6659 	           AT_SYMLINK_NOFOLLOW);
6660 }
6661 
6662 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)6663 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6664 {
6665 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
6666 		return EINVAL;
6667 	}
6668 
6669 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6670 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6671 }
6672 
6673 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)6674 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6675     __unused int32_t *retval)
6676 {
6677 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
6678 		return EINVAL;
6679 	}
6680 
6681 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6682 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6683 }
6684 
6685 /*
6686  * Get configurable pathname variables.
6687  *
6688  * Returns:	0			Success
6689  *	namei:???
6690  *	vn_pathconf:???
6691  *
6692  * Notes:	Global implementation  constants are intended to be
6693  *		implemented in this function directly; all other constants
6694  *		are per-FS implementation, and therefore must be handled in
6695  *		each respective FS, instead.
6696  *
6697  * XXX We implement some things globally right now that should actually be
6698  * XXX per-FS; we will need to deal with this at some point.
6699  */
6700 /* ARGSUSED */
6701 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)6702 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6703 {
6704 	int error;
6705 	struct nameidata nd;
6706 	vfs_context_t ctx = vfs_context_current();
6707 
6708 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6709 	    UIO_USERSPACE, uap->path, ctx);
6710 	error = namei(&nd);
6711 	if (error) {
6712 		return error;
6713 	}
6714 
6715 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6716 
6717 	vnode_put(nd.ni_vp);
6718 	nameidone(&nd);
6719 	return error;
6720 }
6721 
6722 /*
6723  * Return target name of a symbolic link.
6724  */
6725 /* ARGSUSED */
6726 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)6727 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
6728     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6729     int *retval)
6730 {
6731 	vnode_t vp;
6732 	uio_t auio;
6733 	int error;
6734 	struct nameidata nd;
6735 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
6736 	bool put_vnode;
6737 
6738 	if (bufsize > INT32_MAX) {
6739 		return EINVAL;
6740 	}
6741 
6742 	if (lnk_vp) {
6743 		vp = lnk_vp;
6744 		put_vnode = false;
6745 	} else {
6746 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6747 		    seg, path, ctx);
6748 
6749 		error = nameiat(&nd, fd);
6750 		if (error) {
6751 			return error;
6752 		}
6753 		vp = nd.ni_vp;
6754 		put_vnode = true;
6755 		nameidone(&nd);
6756 	}
6757 
6758 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6759 	    &uio_buf[0], sizeof(uio_buf));
6760 	uio_addiov(auio, buf, bufsize);
6761 	if (vp->v_type != VLNK) {
6762 		error = EINVAL;
6763 	} else {
6764 #if CONFIG_MACF
6765 		error = mac_vnode_check_readlink(ctx, vp);
6766 #endif
6767 		if (error == 0) {
6768 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6769 			    ctx);
6770 		}
6771 		if (error == 0) {
6772 			error = VNOP_READLINK(vp, auio, ctx);
6773 		}
6774 	}
6775 
6776 	if (put_vnode) {
6777 		vnode_put(vp);
6778 	}
6779 
6780 	*retval = (int)(bufsize - uio_resid(auio));
6781 	return error;
6782 }
6783 
6784 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)6785 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
6786 {
6787 	enum uio_seg procseg;
6788 	vnode_t vp;
6789 	int error;
6790 
6791 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6792 
6793 	AUDIT_ARG(fd, uap->fd);
6794 
6795 	if ((error = file_vnode(uap->fd, &vp))) {
6796 		return error;
6797 	}
6798 	if ((error = vnode_getwithref(vp))) {
6799 		file_drop(uap->fd);
6800 		return error;
6801 	}
6802 
6803 	error = readlinkat_internal(vfs_context_current(), -1,
6804 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
6805 	    uap->bufsize, procseg, retval);
6806 
6807 	vnode_put(vp);
6808 	file_drop(uap->fd);
6809 	return error;
6810 }
6811 
6812 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)6813 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6814 {
6815 	enum uio_seg procseg;
6816 
6817 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6818 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
6819 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6820 	           uap->count, procseg, retval);
6821 }
6822 
6823 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)6824 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6825 {
6826 	enum uio_seg procseg;
6827 
6828 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6829 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
6830 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
6831 	           retval);
6832 }
6833 
6834 /*
6835  * Change file flags, the deep inner layer.
6836  */
6837 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)6838 chflags0(vnode_t vp, struct vnode_attr *va,
6839     int (*setattr)(vnode_t, void *, vfs_context_t),
6840     void *arg, vfs_context_t ctx)
6841 {
6842 	kauth_action_t action = 0;
6843 	int error;
6844 
6845 #if CONFIG_MACF
6846 	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
6847 	if (error) {
6848 		goto out;
6849 	}
6850 #endif
6851 
6852 	/* request authorisation, disregard immutability */
6853 	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
6854 		goto out;
6855 	}
6856 	/*
6857 	 * Request that the auth layer disregard those file flags it's allowed to when
6858 	 * authorizing this operation; we need to do this in order to be able to
6859 	 * clear immutable flags.
6860 	 */
6861 	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6862 		goto out;
6863 	}
6864 	error = (*setattr)(vp, arg, ctx);
6865 
6866 #if CONFIG_MACF
6867 	if (error == 0) {
6868 		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
6869 	}
6870 #endif
6871 
6872 out:
6873 	return error;
6874 }
6875 
6876 /*
6877  * Change file flags.
6878  *
6879  * NOTE: this will vnode_put() `vp'
6880  */
6881 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)6882 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6883 {
6884 	struct vnode_attr va;
6885 	int error;
6886 
6887 	VATTR_INIT(&va);
6888 	VATTR_SET(&va, va_flags, flags);
6889 
6890 	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6891 	vnode_put(vp);
6892 
6893 	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6894 		error = ENOTSUP;
6895 	}
6896 
6897 	return error;
6898 }
6899 
6900 /*
6901  * Change flags of a file given a path name.
6902  */
6903 /* ARGSUSED */
6904 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)6905 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6906 {
6907 	vnode_t vp;
6908 	vfs_context_t ctx = vfs_context_current();
6909 	int error;
6910 	struct nameidata nd;
6911 
6912 	AUDIT_ARG(fflags, uap->flags);
6913 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6914 	    UIO_USERSPACE, uap->path, ctx);
6915 	error = namei(&nd);
6916 	if (error) {
6917 		return error;
6918 	}
6919 	vp = nd.ni_vp;
6920 	nameidone(&nd);
6921 
6922 	/* we don't vnode_put() here because chflags1 does internally */
6923 	error = chflags1(vp, uap->flags, ctx);
6924 
6925 	return error;
6926 }
6927 
6928 /*
6929  * Change flags of a file given a file descriptor.
6930  */
6931 /* ARGSUSED */
6932 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)6933 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6934 {
6935 	vnode_t vp;
6936 	int error;
6937 
6938 	AUDIT_ARG(fd, uap->fd);
6939 	AUDIT_ARG(fflags, uap->flags);
6940 	if ((error = file_vnode(uap->fd, &vp))) {
6941 		return error;
6942 	}
6943 
6944 	if ((error = vnode_getwithref(vp))) {
6945 		file_drop(uap->fd);
6946 		return error;
6947 	}
6948 
6949 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6950 
6951 	/* we don't vnode_put() here because chflags1 does internally */
6952 	error = chflags1(vp, uap->flags, vfs_context_current());
6953 
6954 	file_drop(uap->fd);
6955 	return error;
6956 }
6957 
6958 /*
6959  * Change security information on a filesystem object.
6960  *
6961  * Returns:	0			Success
6962  *		EPERM			Operation not permitted
6963  *		vnode_authattr:???	[anything vnode_authattr can return]
6964  *		vnode_authorize:???	[anything vnode_authorize can return]
6965  *		vnode_setattr:???	[anything vnode_setattr can return]
6966  *
6967  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
6968  *		translated to EPERM before being returned.
6969  */
6970 static int
chmod_vnode(vfs_context_t ctx,vnode_t vp,struct vnode_attr * vap)6971 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
6972 {
6973 	kauth_action_t action;
6974 	int error;
6975 
6976 	AUDIT_ARG(mode, vap->va_mode);
6977 	/* XXX audit new args */
6978 
6979 #if NAMEDSTREAMS
6980 	/* chmod calls are not allowed for resource forks. */
6981 	if (vp->v_flag & VISNAMEDSTREAM) {
6982 		return EPERM;
6983 	}
6984 #endif
6985 
6986 #if CONFIG_MACF
6987 	if (VATTR_IS_ACTIVE(vap, va_mode) &&
6988 	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
6989 		return error;
6990 	}
6991 
6992 	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6993 		if ((error = mac_vnode_check_setowner(ctx, vp,
6994 		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6995 		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
6996 			return error;
6997 		}
6998 	}
6999 
7000 	if (VATTR_IS_ACTIVE(vap, va_acl) &&
7001 	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
7002 		return error;
7003 	}
7004 #endif
7005 
7006 	/* make sure that the caller is allowed to set this security information */
7007 	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
7008 	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7009 		if (error == EACCES) {
7010 			error = EPERM;
7011 		}
7012 		return error;
7013 	}
7014 
7015 	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
7016 		return error;
7017 	}
7018 
7019 #if CONFIG_MACF
7020 	if (VATTR_IS_ACTIVE(vap, va_mode)) {
7021 		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
7022 	}
7023 
7024 	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
7025 		mac_vnode_notify_setowner(ctx, vp,
7026 		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
7027 		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
7028 	}
7029 
7030 	if (VATTR_IS_ACTIVE(vap, va_acl)) {
7031 		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
7032 	}
7033 #endif
7034 
7035 	return error;
7036 }
7037 
7038 
7039 /*
7040  * Change mode of a file given a path name.
7041  *
7042  * Returns:	0			Success
7043  *		namei:???		[anything namei can return]
7044  *		chmod_vnode:???		[anything chmod_vnode can return]
7045  */
7046 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7047 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7048     int fd, int flag, enum uio_seg segflg)
7049 {
7050 	struct nameidata nd;
7051 	int follow, error;
7052 
7053 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7054 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
7055 	    segflg, path, ctx);
7056 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7057 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7058 	}
7059 	if ((error = nameiat(&nd, fd))) {
7060 		return error;
7061 	}
7062 	error = chmod_vnode(ctx, nd.ni_vp, vap);
7063 	vnode_put(nd.ni_vp);
7064 	nameidone(&nd);
7065 	return error;
7066 }
7067 
7068 /*
7069  * chmod_extended: Change the mode of a file given a path name; with extended
7070  * argument list (including extended security (ACL)).
7071  *
7072  * Parameters:	p			Process requesting the open
7073  *		uap			User argument descriptor (see below)
7074  *		retval			(ignored)
7075  *
7076  * Indirect:	uap->path		Path to object (same as 'chmod')
7077  *		uap->uid		UID to set
7078  *		uap->gid		GID to set
7079  *		uap->mode		File mode to set (same as 'chmod')
7080  *		uap->xsecurity		ACL to set (or delete)
7081  *
7082  * Returns:	0			Success
7083  *		!0			errno value
7084  *
7085  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7086  *
7087  * XXX:		We should enummerate the possible errno values here, and where
7088  *		in the code they originated.
7089  */
7090 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7091 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7092 {
7093 	int error;
7094 	struct vnode_attr va;
7095 	kauth_filesec_t xsecdst;
7096 
7097 	AUDIT_ARG(owner, uap->uid, uap->gid);
7098 
7099 	VATTR_INIT(&va);
7100 	if (uap->mode != -1) {
7101 		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7102 	}
7103 	if (uap->uid != KAUTH_UID_NONE) {
7104 		VATTR_SET(&va, va_uid, uap->uid);
7105 	}
7106 	if (uap->gid != KAUTH_GID_NONE) {
7107 		VATTR_SET(&va, va_gid, uap->gid);
7108 	}
7109 
7110 	xsecdst = NULL;
7111 	switch (uap->xsecurity) {
7112 	/* explicit remove request */
7113 	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
7114 		VATTR_SET(&va, va_acl, NULL);
7115 		break;
7116 	/* not being set */
7117 	case USER_ADDR_NULL:
7118 		break;
7119 	default:
7120 		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
7121 			return error;
7122 		}
7123 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7124 		va.va_vaflags |= VA_FILESEC_ACL;
7125 		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
7126 	}
7127 
7128 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7129 	    UIO_USERSPACE);
7130 
7131 	if (xsecdst != NULL) {
7132 		kauth_filesec_free(xsecdst);
7133 	}
7134 	return error;
7135 }
7136 
7137 /*
7138  * Returns:	0			Success
7139  *		chmodat:???		[anything chmodat can return]
7140  */
7141 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7142 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7143     int flag, enum uio_seg segflg)
7144 {
7145 	struct vnode_attr va;
7146 
7147 	VATTR_INIT(&va);
7148 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7149 
7150 	return chmodat(ctx, path, &va, fd, flag, segflg);
7151 }
7152 
7153 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7154 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7155 {
7156 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7157 	           AT_FDCWD, 0, UIO_USERSPACE);
7158 }
7159 
7160 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7161 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7162 {
7163 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7164 		return EINVAL;
7165 	}
7166 
7167 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7168 	           uap->fd, uap->flag, UIO_USERSPACE);
7169 }
7170 
7171 /*
7172  * Change mode of a file given a file descriptor.
7173  */
7174 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7175 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7176 {
7177 	vnode_t vp;
7178 	int error;
7179 
7180 	AUDIT_ARG(fd, fd);
7181 
7182 	if ((error = file_vnode(fd, &vp)) != 0) {
7183 		return error;
7184 	}
7185 	if ((error = vnode_getwithref(vp)) != 0) {
7186 		file_drop(fd);
7187 		return error;
7188 	}
7189 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7190 
7191 	error = chmod_vnode(vfs_context_current(), vp, vap);
7192 	(void)vnode_put(vp);
7193 	file_drop(fd);
7194 
7195 	return error;
7196 }
7197 
7198 /*
7199  * fchmod_extended: Change mode of a file given a file descriptor; with
7200  * extended argument list (including extended security (ACL)).
7201  *
7202  * Parameters:    p                       Process requesting to change file mode
7203  *                uap                     User argument descriptor (see below)
7204  *                retval                  (ignored)
7205  *
7206  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7207  *                uap->uid                UID to set
7208  *                uap->gid                GID to set
7209  *                uap->xsecurity          ACL to set (or delete)
7210  *                uap->fd                 File descriptor of file to change mode
7211  *
7212  * Returns:        0                      Success
7213  *                !0                      errno value
7214  *
7215  */
7216 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7217 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7218 {
7219 	int error;
7220 	struct vnode_attr va;
7221 	kauth_filesec_t xsecdst;
7222 
7223 	AUDIT_ARG(owner, uap->uid, uap->gid);
7224 
7225 	VATTR_INIT(&va);
7226 	if (uap->mode != -1) {
7227 		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7228 	} else {
7229 		va.va_mode = 0;
7230 	}
7231 
7232 	if (uap->uid != KAUTH_UID_NONE) {
7233 		VATTR_SET(&va, va_uid, uap->uid);
7234 	}
7235 	if (uap->gid != KAUTH_GID_NONE) {
7236 		VATTR_SET(&va, va_gid, uap->gid);
7237 	}
7238 
7239 	xsecdst = NULL;
7240 	switch (uap->xsecurity) {
7241 	case USER_ADDR_NULL:
7242 		VATTR_SET(&va, va_acl, NULL);
7243 		break;
7244 	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
7245 		VATTR_SET(&va, va_acl, NULL);
7246 		break;
7247 	/* not being set */
7248 	case CAST_USER_ADDR_T(-1):
7249 		break;
7250 	default:
7251 		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
7252 			return error;
7253 		}
7254 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7255 		va.va_vaflags |= VA_FILESEC_ACL;
7256 	}
7257 
7258 	error = fchmod1(p, uap->fd, &va);
7259 
7260 
7261 	switch (uap->xsecurity) {
7262 	case USER_ADDR_NULL:
7263 	case CAST_USER_ADDR_T(-1):
7264 		break;
7265 	default:
7266 		if (xsecdst != NULL) {
7267 			kauth_filesec_free(xsecdst);
7268 		}
7269 	}
7270 	return error;
7271 }
7272 
7273 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7274 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7275 {
7276 	struct vnode_attr va;
7277 
7278 	VATTR_INIT(&va);
7279 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7280 
7281 	return fchmod1(p, uap->fd, &va);
7282 }
7283 
7284 
7285 /*
7286  * Set ownership given a path name.
7287  */
7288 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	/* Either "no follow" flavour disables following a symlink at the leaf. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* NOFOLLOW_ANY additionally forbids symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	vnode_put(vp);
	return error;
}
7358 
7359 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7360 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7361 {
7362 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7363 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7364 }
7365 
7366 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7367 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7368 {
7369 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7370 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7371 }
7372 
7373 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7374 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7375 {
7376 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7377 		return EINVAL;
7378 	}
7379 
7380 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7381 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7382 }
7383 
7384 /*
7385  * Set ownership given a file descriptor.
7386  */
7387 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Translate fd -> vnode; file_drop() must balance this on every path. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Authorization failure is reported as EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7456 
7457 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)7458 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7459 {
7460 	int error;
7461 
7462 	if (usrtvp == USER_ADDR_NULL) {
7463 		struct timeval old_tv;
7464 		/* XXX Y2038 bug because of microtime argument */
7465 		microtime(&old_tv);
7466 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7467 		tsp[1] = tsp[0];
7468 	} else {
7469 		if (IS_64BIT_PROCESS(current_proc())) {
7470 			struct user64_timeval tv[2];
7471 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
7472 			if (error) {
7473 				return error;
7474 			}
7475 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
7476 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
7477 		} else {
7478 			struct user32_timeval tv[2];
7479 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
7480 			if (error) {
7481 				return error;
7482 			}
7483 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7484 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7485 		}
7486 	}
7487 	return 0;
7488 }
7489 
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] is the access time, ts[1] the modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* Caller passed a NULL times pointer: "set to now" semantics. */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Setting explicit times requires ownership: report EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7546 
7547 /*
7548  * Set the access and modification times of a file.
7549  */
7550 /* ARGSUSED */
7551 int
utimes(__unused proc_t p,struct utimes_args * uap,__unused int32_t * retval)7552 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
7553 {
7554 	struct timespec ts[2];
7555 	user_addr_t usrtvp;
7556 	int error;
7557 	struct nameidata nd;
7558 	vfs_context_t ctx = vfs_context_current();
7559 
7560 	/*
7561 	 * AUDIT: Needed to change the order of operations to do the
7562 	 * name lookup first because auditing wants the path.
7563 	 */
7564 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
7565 	    UIO_USERSPACE, uap->path, ctx);
7566 	error = namei(&nd);
7567 	if (error) {
7568 		return error;
7569 	}
7570 	nameidone(&nd);
7571 
7572 	/*
7573 	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
7574 	 * the current time instead.
7575 	 */
7576 	usrtvp = uap->tptr;
7577 	if ((error = getutimes(usrtvp, ts)) != 0) {
7578 		goto out;
7579 	}
7580 
7581 	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
7582 
7583 out:
7584 	vnode_put(nd.ni_vp);
7585 	return error;
7586 }
7587 
7588 /*
7589  * Set the access and modification times of a file.
7590  */
7591 /* ARGSUSED */
7592 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)7593 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7594 {
7595 	struct timespec ts[2];
7596 	vnode_t vp;
7597 	user_addr_t usrtvp;
7598 	int error;
7599 
7600 	AUDIT_ARG(fd, uap->fd);
7601 	usrtvp = uap->tptr;
7602 	if ((error = getutimes(usrtvp, ts)) != 0) {
7603 		return error;
7604 	}
7605 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
7606 		return error;
7607 	}
7608 	if ((error = vnode_getwithref(vp))) {
7609 		file_drop(uap->fd);
7610 		return error;
7611 	}
7612 
7613 	error =  setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7614 	vnode_put(vp);
7615 	file_drop(uap->fd);
7616 	return error;
7617 }
7618 
7619 /*
7620  * Truncate a file given its path name.
7621  */
7622 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;
	rlim_t fsize_limit;

	if (uap->length < 0) {
		return EINVAL;
	}

	/* Enforce RLIMIT_FSIZE: deliver SIGXFSZ and fail with EFBIG. */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Truncation is a data-size attribute change. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	/* NOCRED: path-based truncate has no open-file credential to check. */
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7681 
7682 /*
7683  * Truncate a file given a file descriptor.
7684  */
7685 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	rlim_t fsize_limit;

	AUDIT_ARG(fd, uap->fd);
	if (uap->length < 0) {
		return EINVAL;
	}

	/* Enforce RLIMIT_FSIZE: deliver SIGXFSZ and fail with EFBIG. */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	/* ftruncate also applies to POSIX shared-memory objects. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	/* Check against the credential the file was opened with. */
	error = mac_vnode_check_truncate(ctx,
	    fp->fp_glob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7760 
7761 
7762 /*
7763  * Sync an open file with synchronized I/O _file_ integrity completion
7764  */
7765 /* ARGSUSED */
7766 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)7767 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7768 {
7769 	__pthread_testcancel(1);
7770 	return fsync_common(p, uap, MNT_WAIT);
7771 }
7772 
7773 
7774 /*
7775  * Sync an open file with synchronized I/O _file_ integrity completion
7776  *
7777  * Notes:	This is a legacy support function that does not test for
7778  *		thread cancellation points.
7779  */
7780 /* ARGSUSED */
7781 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)7782 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7783 {
7784 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7785 }
7786 
7787 
7788 /*
7789  * Sync an open file with synchronized I/O _data_ integrity completion
7790  */
7791 /* ARGSUSED */
7792 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)7793 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7794 {
7795 	__pthread_testcancel(1);
7796 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7797 }
7798 
7799 
7800 /*
7801  * fsync_common
7802  *
7803  * Common fsync code to support both synchronized I/O file integrity completion
7804  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7805  *
7806  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7807  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
7809  * includes additional metadata unnecessary for retrieving the file data
7810  * contents, such as atime, mtime, ctime, etc., also be committed to stable
7811  * storage.
7812  *
7813  * Parameters:	p				The process
7814  *		uap->fd				The descriptor to synchronize
7815  *		flags				The data integrity flags
7816  *
7817  * Returns:	int				Success
7818  *	fp_getfvp:EBADF				Bad file descriptor
7819  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
7820  *	VNOP_FSYNC:???				unspecified
7821  *
7822  * Notes:	We use struct fsync_args because it is a short name, and all
7823  *		caller argument structures are otherwise identical.
7824  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* fp_getfvp takes a reference on fp; file_drop() balances it below. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (fsync) or MNT_DWAIT (fdatasync). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7862 
7863 /*
7864  * Duplicate files.  Source must be a file, target must be a file or
7865  * must not exist.
7866  *
7867  * XXX Copyfile authorisation checking is woefully inadequate, and will not
7868  *     perform inheritance correctly.
7869  */
7870 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* SAVESTART keeps tond.ni_startdir referenced; released as sdvp below. */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target may only be replaced with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need: read on the source, delete on an overwritten target, add on the dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file onto its own parent directory makes no sense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;     /* sentinel: converted to success below */
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* -1 marks the "source == target" no-op case: report success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
7972 
7973 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7974 
7975 /*
7976  * Helper function for doing clones. The caller is expected to provide an
7977  * iocounted source vnode and release it.
7978  */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and plain (non-root, not mounted-on) dirs clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; WANTPARENT gives us tdvp for the create. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning only works within a single file system. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* The fd-based caller (fclonefileat) already proved READ_DATA; skip it then. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACL. The clone file
	 * will inherit the target directory's ACL.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/* va_acl was not WANTED above; defensive check in case the FS returned one. */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		/* vn_attribute_cleanup() must run in the out: path from here on. */
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		/* DATAVAULT/RESTRICTED bits come from destination inheritance, not source. */
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8199 
8200 /*
8201  * clone files or directories, target must not exist.
8202  */
8203 /* ARGSUSED */
8204 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8205 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8206     __unused int32_t *retval)
8207 {
8208 	vnode_t fvp;
8209 	struct nameidata fromnd;
8210 	int follow;
8211 	int error;
8212 	vfs_context_t ctx = vfs_context_current();
8213 
8214 	/* Check that the flags are valid. */
8215 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8216 		return EINVAL;
8217 	}
8218 
8219 	AUDIT_ARG(fd, uap->src_dirfd);
8220 
8221 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8222 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8223 	    UIO_USERSPACE, uap->src, ctx);
8224 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8225 		return error;
8226 	}
8227 
8228 	fvp = fromnd.ni_vp;
8229 	nameidone(&fromnd);
8230 
8231 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8232 	    uap->flags, ctx);
8233 
8234 	vnode_put(fvp);
8235 	return error;
8236 }
8237 
8238 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8239 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8240     __unused int32_t *retval)
8241 {
8242 	vnode_t fvp;
8243 	struct fileproc *fp;
8244 	int error;
8245 	vfs_context_t ctx = vfs_context_current();
8246 
8247 	/* Check that the flags are valid. */
8248 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8249 		return EINVAL;
8250 	}
8251 
8252 	AUDIT_ARG(fd, uap->src_fd);
8253 	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8254 	if (error) {
8255 		return error;
8256 	}
8257 
8258 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8259 		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8260 		error = EBADF;
8261 		goto out;
8262 	}
8263 
8264 	if ((error = vnode_getwithref(fvp))) {
8265 		goto out;
8266 	}
8267 
8268 	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8269 
8270 	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8271 	    uap->flags, ctx);
8272 
8273 	vnode_put(fvp);
8274 out:
8275 	file_drop(uap->src_fd);
8276 	return error;
8277 }
8278 
8279 static int
rename_submounts_callback(mount_t mp,void * arg)8280 rename_submounts_callback(mount_t mp, void *arg)
8281 {
8282 	int error = 0;
8283 	mount_t pmp = (mount_t)arg;
8284 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8285 
8286 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8287 		return 0;
8288 	}
8289 
8290 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8291 		return 0;
8292 	}
8293 
8294 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8295 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8296 		return -1;
8297 	}
8298 
8299 	int pathlen = MAXPATHLEN;
8300 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8301 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8302 	}
8303 
8304 	vfs_unbusy(mp);
8305 
8306 	return error;
8307 }
8308 
8309 /*
8310  * Rename files.  Source and destination must either both be directories,
8311  * or both not be directories.  If target is a directory, it must be empty.
8312  */
8313 /* ARGSUSED */
8314 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8315 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8316     int tofd, user_addr_t to, int segflg, u_int uflags)
8317 {
8318 	vnode_t tvp, tdvp;
8319 	vnode_t fvp, fdvp;
8320 	vnode_t mnt_fvp;
8321 	struct nameidata *fromnd, *tond;
8322 	int error;
8323 	int do_retry;
8324 	int retry_count;
8325 	int mntrename;
8326 	int need_event;
8327 	int need_kpath2;
8328 	int has_listeners;
8329 	const char *oname = NULL;
8330 	char *from_name = NULL, *to_name = NULL;
8331 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8332 	int from_len = 0, to_len = 0;
8333 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8334 	int holding_mntlock;
8335 	int vn_authorize_skipped;
8336 	mount_t locked_mp = NULL;
8337 	vnode_t oparent = NULLVP;
8338 #if CONFIG_FSE
8339 	fse_info from_finfo = {}, to_finfo;
8340 #endif
8341 	int from_truncated = 0, to_truncated = 0;
8342 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8343 	int batched = 0;
8344 	struct vnode_attr *fvap, *tvap;
8345 	int continuing = 0;
8346 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8347 	int32_t nofollow_any = 0;
8348 	/* carving out a chunk for structs that are too big to be on stack. */
8349 	struct {
8350 		struct nameidata from_node, to_node;
8351 		struct vnode_attr fv_attr, tv_attr;
8352 	} * __rename_data;
8353 
8354 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8355 	fromnd = &__rename_data->from_node;
8356 	tond = &__rename_data->to_node;
8357 
8358 	holding_mntlock = 0;
8359 	do_retry = 0;
8360 	retry_count = 0;
8361 retry:
8362 	fvp = tvp = NULL;
8363 	fdvp = tdvp = NULL;
8364 	fvap = tvap = NULL;
8365 	mnt_fvp = NULLVP;
8366 	mntrename = FALSE;
8367 	vn_authorize_skipped = FALSE;
8368 
8369 	if (uflags & RENAME_NOFOLLOW_ANY) {
8370 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8371 	}
8372 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8373 	    segflg, from, ctx);
8374 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8375 
8376 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8377 	    segflg, to, ctx);
8378 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8379 
8380 continue_lookup:
8381 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8382 		if ((error = nameiat(fromnd, fromfd))) {
8383 			goto out1;
8384 		}
8385 		fdvp = fromnd->ni_dvp;
8386 		fvp  = fromnd->ni_vp;
8387 
8388 		if (fvp && fvp->v_type == VDIR) {
8389 			tond->ni_cnd.cn_flags |= WILLBEDIR;
8390 		}
8391 	}
8392 
8393 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8394 		if ((error = nameiat(tond, tofd))) {
8395 			/*
8396 			 * Translate error code for rename("dir1", "dir2/.").
8397 			 */
8398 			if (error == EISDIR && fvp->v_type == VDIR) {
8399 				error = EINVAL;
8400 			}
8401 			goto out1;
8402 		}
8403 		tdvp = tond->ni_dvp;
8404 		tvp  = tond->ni_vp;
8405 	}
8406 
8407 #if DEVELOPMENT || DEBUG
8408 	/*
8409 	 * XXX VSWAP: Check for entitlements or special flag here
8410 	 * so we can restrict access appropriately.
8411 	 */
8412 #else /* DEVELOPMENT || DEBUG */
8413 
8414 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8415 		error = EPERM;
8416 		goto out1;
8417 	}
8418 
8419 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8420 		error = EPERM;
8421 		goto out1;
8422 	}
8423 #endif /* DEVELOPMENT || DEBUG */
8424 
8425 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8426 		error = ENOENT;
8427 		goto out1;
8428 	}
8429 
8430 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8431 		int32_t pval = 0;
8432 		int err = 0;
8433 
8434 		/*
8435 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8436 		 * has the same name as target iff the following conditions are met:
8437 		 * 1. the target file system is case insensitive
8438 		 * 2. source and target directories are the same
8439 		 * 3. source and target files are the same
8440 		 * 4. name only differs in case (determined by underlying filesystem)
8441 		 */
8442 		if (fvp != tvp || fdvp != tdvp) {
8443 			error = EEXIST;
8444 			goto out1;
8445 		}
8446 
8447 		/*
8448 		 * Assume that the target file system is case sensitive if
8449 		 * _PC_CASE_SENSITIVE selector isn't supported.
8450 		 */
8451 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8452 		if (err != 0 || pval != 0) {
8453 			error = EEXIST;
8454 			goto out1;
8455 		}
8456 	}
8457 
8458 	batched = vnode_compound_rename_available(fdvp);
8459 
8460 #if CONFIG_FSE
8461 	need_event = need_fsevent(FSE_RENAME, fdvp);
8462 	if (need_event) {
8463 		if (fvp) {
8464 			get_fse_info(fvp, &from_finfo, ctx);
8465 		} else {
8466 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8467 			if (error) {
8468 				goto out1;
8469 			}
8470 
8471 			fvap = &__rename_data->fv_attr;
8472 		}
8473 
8474 		if (tvp) {
8475 			get_fse_info(tvp, &to_finfo, ctx);
8476 		} else if (batched) {
8477 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8478 			if (error) {
8479 				goto out1;
8480 			}
8481 
8482 			tvap = &__rename_data->tv_attr;
8483 		}
8484 	}
8485 #else
8486 	need_event = 0;
8487 #endif /* CONFIG_FSE */
8488 
8489 	has_listeners = kauth_authorize_fileop_has_listeners();
8490 
8491 	need_kpath2 = 0;
8492 #if CONFIG_AUDIT
8493 	if (AUDIT_RECORD_EXISTS()) {
8494 		need_kpath2 = 1;
8495 	}
8496 #endif
8497 
8498 	if (need_event || has_listeners) {
8499 		if (from_name == NULL) {
8500 			GET_PATH(from_name);
8501 		}
8502 
8503 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8504 
8505 		if (from_name_no_firmlink == NULL) {
8506 			GET_PATH(from_name_no_firmlink);
8507 		}
8508 
8509 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8510 	}
8511 
8512 	if (need_event || need_kpath2 || has_listeners) {
8513 		if (to_name == NULL) {
8514 			GET_PATH(to_name);
8515 		}
8516 
8517 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8518 
8519 		if (to_name_no_firmlink == NULL) {
8520 			GET_PATH(to_name_no_firmlink);
8521 		}
8522 
8523 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8524 		if (to_name && need_kpath2) {
8525 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8526 		}
8527 	}
8528 	if (!fvp) {
8529 		/*
8530 		 * Claim: this check will never reject a valid rename.
8531 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8532 		 * Suppose fdvp and tdvp are not on the same mount.
8533 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
8534 		 *      then you can't move it to within another dir on the same mountpoint.
8535 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8536 		 *
8537 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
8538 		 */
8539 		if (fdvp->v_mount != tdvp->v_mount) {
8540 			error = EXDEV;
8541 			goto out1;
8542 		}
8543 		goto skipped_lookup;
8544 	}
8545 
8546 	/*
8547 	 * If the source and destination are the same (i.e. they're
8548 	 * links to the same vnode) and the target file system is
8549 	 * case sensitive, then there is nothing to do.
8550 	 *
8551 	 * XXX Come back to this.
8552 	 */
8553 	if (fvp == tvp) {
8554 		int pathconf_val;
8555 
8556 		/*
8557 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8558 		 * then assume that this file system is case sensitive.
8559 		 */
8560 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8561 		    pathconf_val != 0) {
8562 			vn_authorize_skipped = TRUE;
8563 			goto out1;
8564 		}
8565 	}
8566 
8567 	/*
8568 	 * Allow the renaming of mount points.
8569 	 * - target must not exist
8570 	 * - target must reside in the same directory as source
8571 	 * - union mounts cannot be renamed
8572 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
8573 	 *
8574 	 * XXX Handle this in VFS after a continued lookup (if we missed
8575 	 * in the cache to start off)
8576 	 *
8577 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8578 	 * we'll skip past here.  The file system is responsible for
8579 	 * checking that @tvp is not a descendent of @fvp and vice versa
8580 	 * so it should always return EINVAL if either @tvp or @fvp is the
8581 	 * root of a volume.
8582 	 */
8583 	if ((fvp->v_flag & VROOT) &&
8584 	    (fvp->v_type == VDIR) &&
8585 	    (tvp == NULL) &&
8586 	    (fvp->v_mountedhere == NULL) &&
8587 	    (fdvp == tdvp) &&
8588 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8589 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8590 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8591 		vnode_t coveredvp;
8592 
8593 		/* switch fvp to the covered vnode */
8594 		coveredvp = fvp->v_mount->mnt_vnodecovered;
8595 		if ((vnode_getwithref(coveredvp))) {
8596 			error = ENOENT;
8597 			goto out1;
8598 		}
8599 		/*
8600 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
8601 		 * later.
8602 		 */
8603 		mnt_fvp = fvp;
8604 
8605 		fvp = coveredvp;
8606 		mntrename = TRUE;
8607 	}
8608 	/*
8609 	 * Check for cross-device rename.
8610 	 */
8611 	if ((fvp->v_mount != tdvp->v_mount) ||
8612 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
8613 		error = EXDEV;
8614 		goto out1;
8615 	}
8616 
8617 	/*
8618 	 * If source is the same as the destination (that is the
8619 	 * same inode number) then there is nothing to do...
8620 	 * EXCEPT if the underlying file system supports case
8621 	 * insensitivity and is case preserving.  In this case
8622 	 * the file system needs to handle the special case of
8623 	 * getting the same vnode as target (fvp) and source (tvp).
8624 	 *
8625 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8626 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
8627 	 * handle the special case of getting the same vnode as target and
8628 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
8629 	 * so not to cause locking problems. There is a single reference on tvp.
8630 	 *
8631 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
8632 	 * that correct behaviour then is just to return success without doing
8633 	 * anything.
8634 	 *
8635 	 * XXX filesystem should take care of this itself, perhaps...
8636 	 */
8637 	if (fvp == tvp && fdvp == tdvp) {
8638 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8639 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8640 		    fromnd->ni_cnd.cn_namelen)) {
8641 			vn_authorize_skipped = TRUE;
8642 			goto out1;
8643 		}
8644 	}
8645 
8646 	if (holding_mntlock && fvp->v_mount != locked_mp) {
8647 		/*
8648 		 * we're holding a reference and lock
8649 		 * on locked_mp, but it no longer matches
8650 		 * what we want to do... so drop our hold
8651 		 */
8652 		mount_unlock_renames(locked_mp);
8653 		mount_drop(locked_mp, 0);
8654 		holding_mntlock = 0;
8655 	}
8656 	if (tdvp != fdvp && fvp->v_type == VDIR) {
8657 		/*
8658 		 * serialize renames that re-shape
8659 		 * the tree... if holding_mntlock is
8660 		 * set, then we're ready to go...
8661 		 * otherwise we
8662 		 * first need to drop the iocounts
8663 		 * we picked up, second take the
8664 		 * lock to serialize the access,
8665 		 * then finally start the lookup
8666 		 * process over with the lock held
8667 		 */
8668 		if (!holding_mntlock) {
8669 			/*
8670 			 * need to grab a reference on
8671 			 * the mount point before we
8672 			 * drop all the iocounts... once
8673 			 * the iocounts are gone, the mount
8674 			 * could follow
8675 			 */
8676 			locked_mp = fvp->v_mount;
8677 			mount_ref(locked_mp, 0);
8678 
8679 			/*
8680 			 * nameidone has to happen before we vnode_put(tvp)
8681 			 * since it may need to release the fs_nodelock on the tvp
8682 			 */
8683 			nameidone(tond);
8684 
8685 			if (tvp) {
8686 				vnode_put(tvp);
8687 			}
8688 			vnode_put(tdvp);
8689 
8690 			/*
8691 			 * nameidone has to happen before we vnode_put(fdvp)
8692 			 * since it may need to release the fs_nodelock on the fvp
8693 			 */
8694 			nameidone(fromnd);
8695 
8696 			vnode_put(fvp);
8697 			vnode_put(fdvp);
8698 
8699 			if (mnt_fvp != NULLVP) {
8700 				vnode_put(mnt_fvp);
8701 			}
8702 
8703 			mount_lock_renames(locked_mp);
8704 			holding_mntlock = 1;
8705 
8706 			goto retry;
8707 		}
8708 	} else {
8709 		/*
8710 		 * when we dropped the iocounts to take
8711 		 * the lock, we allowed the identity of
8712 		 * the various vnodes to change... if they did,
8713 		 * we may no longer be dealing with a rename
8714 		 * that reshapes the tree... once we're holding
8715 		 * the iocounts, the vnodes can't change type
8716 		 * so we're free to drop the lock at this point
8717 		 * and continue on
8718 		 */
8719 		if (holding_mntlock) {
8720 			mount_unlock_renames(locked_mp);
8721 			mount_drop(locked_mp, 0);
8722 			holding_mntlock = 0;
8723 		}
8724 	}
8725 
8726 	if (!batched) {
8727 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
8728 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8729 		    flags, NULL);
8730 		if (error) {
8731 			if (error == ENOENT) {
8732 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8733 					/*
8734 					 * We encountered a race where after doing the namei,
8735 					 * tvp stops being valid. If so, simply re-drive the rename
8736 					 * call from the top.
8737 					 */
8738 					do_retry = 1;
8739 					retry_count += 1;
8740 				}
8741 			}
8742 			goto out1;
8743 		}
8744 	}
8745 
8746 	/* Release the 'mnt_fvp' now that it is no longer needed. */
8747 	if (mnt_fvp != NULLVP) {
8748 		vnode_put(mnt_fvp);
8749 		mnt_fvp = NULLVP;
8750 	}
8751 
8752 	// save these off so we can later verify that fvp is the same
8753 	oname   = fvp->v_name;
8754 	oparent = fvp->v_parent;
8755 
8756 skipped_lookup:
8757 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8758 	    tdvp, &tvp, &tond->ni_cnd, tvap,
8759 	    flags, ctx);
8760 
8761 	if (holding_mntlock) {
8762 		/*
8763 		 * we can drop our serialization
8764 		 * lock now
8765 		 */
8766 		mount_unlock_renames(locked_mp);
8767 		mount_drop(locked_mp, 0);
8768 		holding_mntlock = 0;
8769 	}
8770 	if (error) {
8771 		if (error == EDATALESS) {
8772 			/*
8773 			 * If we've been here before, something has gone
8774 			 * horribly wrong and we should just get out lest
8775 			 * we spiral around the drain forever.
8776 			 */
8777 			if (flags & VFS_RENAME_DATALESS) {
8778 				error = EIO;
8779 				goto out1;
8780 			}
8781 
8782 			/*
8783 			 * The object we're renaming is dataless (or has a
8784 			 * dataless descendent) and requires materialization
8785 			 * before the rename occurs.  But we're holding the
8786 			 * mount point's rename lock, so it's not safe to
8787 			 * make the upcall.
8788 			 *
8789 			 * In this case, we release the lock, perform the
8790 			 * materialization, and start the whole thing over.
8791 			 */
8792 			error = vnode_materialize_dataless_file(fvp,
8793 			    NAMESPACE_HANDLER_RENAME_OP);
8794 
8795 			if (error == 0) {
8796 				/*
8797 				 * The next time around we need to tell the
8798 				 * file system that the materializtaion has
8799 				 * been performed.
8800 				 */
8801 				flags |= VFS_RENAME_DATALESS;
8802 				do_retry = 1;
8803 			}
8804 			goto out1;
8805 		}
8806 		if (error == EKEEPLOOKING) {
8807 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8808 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8809 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8810 				}
8811 			}
8812 
8813 			fromnd->ni_vp = fvp;
8814 			tond->ni_vp = tvp;
8815 
8816 			goto continue_lookup;
8817 		}
8818 
8819 		/*
8820 		 * We may encounter a race in the VNOP where the destination didn't
8821 		 * exist when we did the namei, but it does by the time we go and
8822 		 * try to create the entry. In this case, we should re-drive this rename
8823 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
8824 		 * but other filesystems susceptible to this race could return it, too.
8825 		 */
8826 		if (error == ERECYCLE) {
8827 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8828 				do_retry = 1;
8829 				retry_count += 1;
8830 			} else {
8831 				printf("rename retry limit due to ERECYCLE reached\n");
8832 				error = ENOENT;
8833 			}
8834 		}
8835 
8836 		/*
8837 		 * For compound VNOPs, the authorization callback may return
8838 		 * ENOENT in case of racing hardlink lookups hitting the name
8839 		 * cache, redrive the lookup.
8840 		 */
8841 		if (batched && error == ENOENT) {
8842 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8843 				do_retry = 1;
8844 				retry_count += 1;
8845 			}
8846 		}
8847 
8848 		goto out1;
8849 	}
8850 
8851 	/* call out to allow 3rd party notification of rename.
8852 	 * Ignore result of kauth_authorize_fileop call.
8853 	 */
8854 	kauth_authorize_fileop(vfs_context_ucred(ctx),
8855 	    KAUTH_FILEOP_RENAME,
8856 	    (uintptr_t)from_name, (uintptr_t)to_name);
8857 	if (flags & VFS_RENAME_SWAP) {
8858 		kauth_authorize_fileop(vfs_context_ucred(ctx),
8859 		    KAUTH_FILEOP_RENAME,
8860 		    (uintptr_t)to_name, (uintptr_t)from_name);
8861 	}
8862 
8863 #if CONFIG_FSE
8864 	if (from_name != NULL && to_name != NULL) {
8865 		if (from_truncated || to_truncated) {
8866 			// set it here since only the from_finfo gets reported up to user space
8867 			from_finfo.mode |= FSE_TRUNCATED_PATH;
8868 		}
8869 
8870 		if (tvap && tvp) {
8871 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8872 		}
8873 		if (fvap) {
8874 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8875 		}
8876 
8877 		if (tvp) {
8878 			add_fsevent(FSE_RENAME, ctx,
8879 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8880 			    FSE_ARG_FINFO, &from_finfo,
8881 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8882 			    FSE_ARG_FINFO, &to_finfo,
8883 			    FSE_ARG_DONE);
8884 			if (flags & VFS_RENAME_SWAP) {
8885 				/*
8886 				 * Strictly speaking, swap is the equivalent of
8887 				 * *three* renames.  FSEvents clients should only take
8888 				 * the events as a hint, so we only bother reporting
8889 				 * two.
8890 				 */
8891 				add_fsevent(FSE_RENAME, ctx,
8892 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8893 				    FSE_ARG_FINFO, &to_finfo,
8894 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8895 				    FSE_ARG_FINFO, &from_finfo,
8896 				    FSE_ARG_DONE);
8897 			}
8898 		} else {
8899 			add_fsevent(FSE_RENAME, ctx,
8900 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8901 			    FSE_ARG_FINFO, &from_finfo,
8902 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8903 			    FSE_ARG_DONE);
8904 		}
8905 	}
8906 #endif /* CONFIG_FSE */
8907 
8908 	/*
8909 	 * update filesystem's mount point data
8910 	 */
8911 	if (mntrename) {
8912 		char *cp, *pathend, *mpname;
8913 		char * tobuf;
8914 		struct mount *mp;
8915 		int maxlen;
8916 		size_t len = 0;
8917 
8918 		mp = fvp->v_mountedhere;
8919 
8920 		if (vfs_busy(mp, LK_NOWAIT)) {
8921 			error = EBUSY;
8922 			goto out1;
8923 		}
8924 		tobuf = zalloc(ZV_NAMEI);
8925 
8926 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
8927 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8928 		} else {
8929 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8930 		}
8931 		if (!error) {
8932 			/* find current mount point prefix */
8933 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
8934 			for (cp = pathend; *cp != '\0'; ++cp) {
8935 				if (*cp == '/') {
8936 					pathend = cp + 1;
8937 				}
8938 			}
8939 			/* find last component of target name */
8940 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8941 				if (*cp == '/') {
8942 					mpname = cp + 1;
8943 				}
8944 			}
8945 
8946 			/* Update f_mntonname of sub mounts */
8947 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
8948 
8949 			/* append name to prefix */
8950 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
8951 			bzero(pathend, maxlen);
8952 
8953 			strlcpy(pathend, mpname, maxlen);
8954 		}
8955 		zfree(ZV_NAMEI, tobuf);
8956 
8957 		vfs_unbusy(mp);
8958 
8959 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8960 	}
8961 	/*
8962 	 * fix up name & parent pointers.  note that we first
8963 	 * check that fvp has the same name/parent pointers it
8964 	 * had before the rename call... this is a 'weak' check
8965 	 * at best...
8966 	 *
8967 	 * XXX oparent and oname may not be set in the compound vnop case
8968 	 */
8969 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8970 		int update_flags;
8971 
8972 		update_flags = VNODE_UPDATE_NAME;
8973 
8974 		if (fdvp != tdvp) {
8975 			update_flags |= VNODE_UPDATE_PARENT;
8976 		}
8977 
8978 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8979 	}
8980 out1:
8981 	/*
8982 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
8983 	 * skipped earlier as no actual rename was performed.
8984 	 */
8985 	if (vn_authorize_skipped && error == 0) {
8986 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
8987 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8988 		    flags, NULL);
8989 		if (error && error == ENOENT) {
8990 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8991 				do_retry = 1;
8992 				retry_count += 1;
8993 			}
8994 		}
8995 	}
8996 	if (to_name != NULL) {
8997 		RELEASE_PATH(to_name);
8998 		to_name = NULL;
8999 	}
9000 	if (to_name_no_firmlink != NULL) {
9001 		RELEASE_PATH(to_name_no_firmlink);
9002 		to_name_no_firmlink = NULL;
9003 	}
9004 	if (from_name != NULL) {
9005 		RELEASE_PATH(from_name);
9006 		from_name = NULL;
9007 	}
9008 	if (from_name_no_firmlink != NULL) {
9009 		RELEASE_PATH(from_name_no_firmlink);
9010 		from_name_no_firmlink = NULL;
9011 	}
9012 	if (holding_mntlock) {
9013 		mount_unlock_renames(locked_mp);
9014 		mount_drop(locked_mp, 0);
9015 		holding_mntlock = 0;
9016 	}
9017 	if (tdvp) {
9018 		/*
9019 		 * nameidone has to happen before we vnode_put(tdvp)
9020 		 * since it may need to release the fs_nodelock on the tdvp
9021 		 */
9022 		nameidone(tond);
9023 
9024 		if (tvp) {
9025 			vnode_put(tvp);
9026 		}
9027 		vnode_put(tdvp);
9028 	}
9029 	if (fdvp) {
9030 		/*
9031 		 * nameidone has to happen before we vnode_put(fdvp)
9032 		 * since it may need to release the fs_nodelock on the fdvp
9033 		 */
9034 		nameidone(fromnd);
9035 
9036 		if (fvp) {
9037 			vnode_put(fvp);
9038 		}
9039 		vnode_put(fdvp);
9040 	}
9041 	if (mnt_fvp != NULLVP) {
9042 		vnode_put(mnt_fvp);
9043 	}
9044 	/*
9045 	 * If things changed after we did the namei, then we will re-drive
9046 	 * this rename call from the top.
9047 	 */
9048 	if (do_retry) {
9049 		do_retry = 0;
9050 		goto retry;
9051 	}
9052 
9053 	kfree_type(typeof(*__rename_data), __rename_data);
9054 	return error;
9055 }
9056 
9057 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9058 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9059 {
9060 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9061 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9062 }
9063 
9064 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9065 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9066 {
9067 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9068 		return EINVAL;
9069 	}
9070 
9071 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9072 		return EINVAL;
9073 	}
9074 
9075 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9076 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9077 }
9078 
9079 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9080 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9081 {
9082 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9083 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9084 }
9085 
9086 /*
9087  * Make a directory file.
9088  *
9089  * Returns:	0			Success
9090  *		EEXIST
9091  *	namei:???
9092  *	vnode_authorize:???
9093  *	vn_create:???
9094  */
9095 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * CREATE-intent lookup of the final component; request a compound
	 * mkdir so filesystems that support it can do lookup+create in one
	 * VNOP.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	/* Re-entered from below if vn_create() returns EKEEPLOOKING. */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode for the final component means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/*
			 * Drop the original lookup state before re-driving a
			 * plain lookup; nameidone must precede vnode_put(dvp).
			 */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: keep the EACCES/EPERM. */
				goto out;
			} else {
				/* Target exists after all: report EEXIST. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs more lookup work; resume it. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9207 
9208 /*
9209  * mkdir_extended: Create a directory; with extended security (ACL).
9210  *
9211  * Parameters:    p                       Process requesting to create the directory
9212  *                uap                     User argument descriptor (see below)
9213  *                retval                  (ignored)
9214  *
9215  * Indirect:      uap->path               Path of directory to create
9216  *                uap->mode               Access permissions to set
9217  *                uap->xsecurity          ACL to set
9218  *
9219  * Returns:        0                      Success
9220  *                !0                      Not success
9221  *
9222  */
9223 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9224 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9225 {
9226 	int ciferror;
9227 	kauth_filesec_t xsecdst;
9228 	struct vnode_attr va;
9229 
9230 	AUDIT_ARG(owner, uap->uid, uap->gid);
9231 
9232 	xsecdst = NULL;
9233 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9234 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9235 		return ciferror;
9236 	}
9237 
9238 	VATTR_INIT(&va);
9239 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9240 	if (xsecdst != NULL) {
9241 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9242 		va.va_vaflags |= VA_FILESEC_ACL;
9243 	}
9244 
9245 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9246 	    UIO_USERSPACE);
9247 	if (xsecdst != NULL) {
9248 		kauth_filesec_free(xsecdst);
9249 	}
9250 	return ciferror;
9251 }
9252 
9253 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9254 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9255 {
9256 	struct vnode_attr va;
9257 
9258 	VATTR_INIT(&va);
9259 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9260 
9261 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9262 	           UIO_USERSPACE);
9263 }
9264 
9265 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9266 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9267 {
9268 	struct vnode_attr va;
9269 
9270 	VATTR_INIT(&va);
9271 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9272 
9273 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9274 	           UIO_USERSPACE);
9275 }
9276 
9277 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)9278 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9279     enum uio_seg segflg, int unlink_flags)
9280 {
9281 	struct {
9282 		struct nameidata nd;
9283 #if CONFIG_FSE
9284 		struct vnode_attr va;
9285 #endif /* CONFIG_FSE */
9286 	} *__rmdir_data;
9287 	vnode_t vp, dvp;
9288 	int error;
9289 	struct nameidata *ndp;
9290 	char     *path = NULL;
9291 	char     *no_firmlink_path = NULL;
9292 	int       len_path = 0;
9293 	int       len_no_firmlink_path = 0;
9294 	int has_listeners = 0;
9295 	int need_event = 0;
9296 	int truncated_path = 0;
9297 	int truncated_no_firmlink_path = 0;
9298 	struct vnode_attr *vap = NULL;
9299 	int restart_count = 0;
9300 	int batched;
9301 
9302 	int restart_flag;
9303 
9304 	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
9305 	ndp = &__rmdir_data->nd;
9306 
9307 	/*
9308 	 * This loop exists to restart rmdir in the unlikely case that two
9309 	 * processes are simultaneously trying to remove the same directory
9310 	 * containing orphaned appleDouble files.
9311 	 */
9312 	do {
9313 		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9314 		    segflg, dirpath, ctx);
9315 		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
9316 continue_lookup:
9317 		restart_flag = 0;
9318 		vap = NULL;
9319 
9320 		error = nameiat(ndp, fd);
9321 		if (error) {
9322 			goto err_out;
9323 		}
9324 
9325 		dvp = ndp->ni_dvp;
9326 		vp = ndp->ni_vp;
9327 
9328 		if (vp) {
9329 			batched = vnode_compound_rmdir_available(vp);
9330 
9331 			if (vp->v_flag & VROOT) {
9332 				/*
9333 				 * The root of a mounted filesystem cannot be deleted.
9334 				 */
9335 				error = EBUSY;
9336 				goto out;
9337 			}
9338 
9339 #if DEVELOPMENT || DEBUG
9340 			/*
9341 			 * XXX VSWAP: Check for entitlements or special flag here
9342 			 * so we can restrict access appropriately.
9343 			 */
9344 #else /* DEVELOPMENT || DEBUG */
9345 
9346 			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9347 				error = EPERM;
9348 				goto out;
9349 			}
9350 #endif /* DEVELOPMENT || DEBUG */
9351 
9352 			/*
9353 			 * Removed a check here; we used to abort if vp's vid
9354 			 * was not the same as what we'd seen the last time around.
9355 			 * I do not think that check was valid, because if we retry
9356 			 * and all dirents are gone, the directory could legitimately
9357 			 * be recycled but still be present in a situation where we would
9358 			 * have had permission to delete.  Therefore, we won't make
9359 			 * an effort to preserve that check now that we may not have a
9360 			 * vp here.
9361 			 */
9362 
9363 			if (!batched) {
9364 				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
9365 				if (error) {
9366 					if (error == ENOENT) {
9367 						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9368 							restart_flag = 1;
9369 							restart_count += 1;
9370 						}
9371 					}
9372 					goto out;
9373 				}
9374 			}
9375 		} else {
9376 			batched = 1;
9377 
9378 			if (!vnode_compound_rmdir_available(dvp)) {
9379 				panic("No error, but no compound rmdir?");
9380 			}
9381 		}
9382 
9383 #if CONFIG_FSE
9384 		fse_info  finfo = {0};
9385 
9386 		need_event = need_fsevent(FSE_DELETE, dvp);
9387 		if (need_event) {
9388 			if (!batched) {
9389 				get_fse_info(vp, &finfo, ctx);
9390 			} else {
9391 				error = vfs_get_notify_attributes(&__rmdir_data->va);
9392 				if (error) {
9393 					goto out;
9394 				}
9395 
9396 				vap = &__rmdir_data->va;
9397 			}
9398 		}
9399 #endif
9400 		has_listeners = kauth_authorize_fileop_has_listeners();
9401 		if (need_event || has_listeners) {
9402 			if (path == NULL) {
9403 				GET_PATH(path);
9404 			}
9405 
9406 			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
9407 
9408 			if (no_firmlink_path == NULL) {
9409 				GET_PATH(no_firmlink_path);
9410 			}
9411 
9412 			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
9413 #if CONFIG_FSE
9414 			if (truncated_no_firmlink_path) {
9415 				finfo.mode |= FSE_TRUNCATED_PATH;
9416 			}
9417 #endif
9418 		}
9419 
9420 		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
9421 		ndp->ni_vp = vp;
9422 		if (vp == NULLVP) {
9423 			/* Couldn't find a vnode */
9424 			goto out;
9425 		}
9426 
9427 		if (error == EKEEPLOOKING) {
9428 			goto continue_lookup;
9429 		} else if (batched && error == ENOENT) {
9430 			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9431 				/*
9432 				 * For compound VNOPs, the authorization callback
9433 				 * may return ENOENT in case of racing hard link lookups
9434 				 * redrive the lookup.
9435 				 */
9436 				restart_flag = 1;
9437 				restart_count += 1;
9438 				goto out;
9439 			}
9440 		}
9441 
9442 		/*
9443 		 * XXX There's no provision for passing flags
9444 		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
9445 		 * because it's not empty, then we try again
9446 		 * with VNOP_REMOVE(), passing in a special
9447 		 * flag that clever file systems will know
9448 		 * how to handle.
9449 		 */
9450 		if (error == ENOTEMPTY &&
9451 		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
9452 			/*
9453 			 * If this fails, we want to keep the original
9454 			 * error.
9455 			 */
9456 			if (vn_remove(dvp, &vp, ndp,
9457 			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
9458 				error = 0;
9459 			}
9460 		}
9461 
9462 #if CONFIG_APPLEDOUBLE
9463 		/*
9464 		 * Special case to remove orphaned AppleDouble
9465 		 * files. I don't like putting this in the kernel,
9466 		 * but carbon does not like putting this in carbon either,
9467 		 * so here we are.
9468 		 */
9469 		if (error == ENOTEMPTY) {
9470 			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
9471 			if (ad_error == EBUSY) {
9472 				error = ad_error;
9473 				goto out;
9474 			}
9475 
9476 
9477 			/*
9478 			 * Assuming everything went well, we will try the RMDIR again
9479 			 */
9480 			if (!ad_error) {
9481 				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
9482 			}
9483 		}
9484 #endif /* CONFIG_APPLEDOUBLE */
9485 		/*
9486 		 * Call out to allow 3rd party notification of delete.
9487 		 * Ignore result of kauth_authorize_fileop call.
9488 		 */
9489 		if (!error) {
9490 			if (has_listeners) {
9491 				kauth_authorize_fileop(vfs_context_ucred(ctx),
9492 				    KAUTH_FILEOP_DELETE,
9493 				    (uintptr_t)vp,
9494 				    (uintptr_t)path);
9495 			}
9496 
9497 			if (vp->v_flag & VISHARDLINK) {
9498 				// see the comment in unlink1() about why we update
9499 				// the parent of a hard link when it is removed
9500 				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
9501 			}
9502 
9503 #if CONFIG_FSE
9504 			if (need_event) {
9505 				if (vap) {
9506 					vnode_get_fse_info_from_vap(vp, &finfo, vap);
9507 				}
9508 				add_fsevent(FSE_DELETE, ctx,
9509 				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
9510 				    FSE_ARG_FINFO, &finfo,
9511 				    FSE_ARG_DONE);
9512 			}
9513 #endif
9514 		}
9515 
9516 out:
9517 		if (path != NULL) {
9518 			RELEASE_PATH(path);
9519 			path = NULL;
9520 		}
9521 
9522 		if (no_firmlink_path != NULL) {
9523 			RELEASE_PATH(no_firmlink_path);
9524 			no_firmlink_path = NULL;
9525 		}
9526 
9527 		/*
9528 		 * nameidone has to happen before we vnode_put(dvp)
9529 		 * since it may need to release the fs_nodelock on the dvp
9530 		 */
9531 		nameidone(ndp);
9532 		vnode_put(dvp);
9533 
9534 		if (vp) {
9535 			vnode_put(vp);
9536 		}
9537 
9538 		if (restart_flag == 0) {
9539 			wakeup_one((caddr_t)vp);
9540 			goto err_out;
9541 		}
9542 		tsleep(vp, PVFS, "rm AD", 1);
9543 	} while (restart_flag != 0);
9544 
9545 err_out:
9546 	kfree_type(typeof(*__rmdir_data), __rmdir_data);
9547 
9548 	return error;
9549 }
9550 
9551 /*
9552  * Remove a directory file.
9553  */
9554 /* ARGSUSED */
9555 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)9556 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9557 {
9558 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9559 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9560 }
9561 
/*
 * Get direntry length padded to 8 byte alignment.
 *
 * sizeof(struct direntry) already includes a MAXPATHLEN-byte d_name field;
 * subtracting (MAXPATHLEN-1) and adding back the actual name length leaves
 * room for the name plus its NUL terminator, rounded up to a multiple of 8.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 *
 * Same idea as DIRENT64_LEN: swap the full (__DARWIN_MAXNAMLEN + 1)-byte
 * d_name field for the actual name length + NUL, rounded up to 4 bytes.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (pointer to its last byte, inclusive) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
9573 
9574 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)9575 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9576     int *numdirent, vfs_context_t ctxp)
9577 {
9578 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
9579 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9580 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9581 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9582 	} else {
9583 		size_t bufsize;
9584 		void * bufptr;
9585 		uio_t auio;
9586 		struct direntry *entry64;
9587 		struct dirent *dep;
9588 		size_t bytesread;
9589 		int error;
9590 
9591 		/*
9592 		 * We're here because the underlying file system does not
9593 		 * support direnties or we mounted denying support so we must
9594 		 * fall back to dirents and convert them to direntries.
9595 		 *
9596 		 * Our kernel buffer needs to be smaller since re-packing will
9597 		 * expand each dirent.  The worse case (when the name length
9598 		 * is 3 or less) corresponds to a struct direntry size of 32
9599 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9600 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
9601 		 * will prevent us from reading more than we can pack.
9602 		 *
9603 		 * Since this buffer is wired memory, we will limit the
9604 		 * buffer size to a maximum of 32K. We would really like to
9605 		 * use 32K in the MIN(), but we use magic number 87371 to
9606 		 * prevent uio_resid() * 3 / 8 from overflowing.
9607 		 */
9608 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9609 		bufptr = kalloc_data(bufsize, Z_WAITOK);
9610 		if (bufptr == NULL) {
9611 			return ENOMEM;
9612 		}
9613 
9614 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9615 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9616 		auio->uio_offset = uio->uio_offset;
9617 
9618 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9619 
9620 		dep = (struct dirent *)bufptr;
9621 		bytesread = bufsize - uio_resid(auio);
9622 
9623 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
9624 		/*
9625 		 * Convert all the entries and copy them out to user's buffer.
9626 		 */
9627 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9628 			/* First check that the dirent struct up to d_name is within the buffer */
9629 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
9630 			    /* Check that the length of the entire dirent is within the buffer */
9631 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9632 			    /* Check that the actual length including the name doesn't exceed d_reclen */
9633 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9634 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9635 				    vp->v_mount->mnt_vfsstat.f_mntonname,
9636 				    vp->v_name ? vp->v_name : "<unknown>");
9637 				error = EIO;
9638 				break;
9639 			}
9640 
9641 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
9642 
9643 			bzero(entry64, enbufsize);
9644 			/* Convert a dirent to a dirent64. */
9645 			entry64->d_ino = dep->d_ino;
9646 			entry64->d_seekoff = 0;
9647 			entry64->d_reclen = (uint16_t)enbufsize;
9648 			entry64->d_namlen = dep->d_namlen;
9649 			entry64->d_type = dep->d_type;
9650 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9651 
9652 			/* Move to next entry. */
9653 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
9654 
9655 			/* Copy entry64 to user's buffer. */
9656 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9657 		}
9658 
9659 		/* Update the real offset using the offset we got from VNOP_READDIR. */
9660 		if (error == 0) {
9661 			uio->uio_offset = auio->uio_offset;
9662 		}
9663 		uio_free(auio);
9664 		kfree_data(bufptr, bufsize);
9665 		kfree_type(struct direntry, entry64);
9666 		return error;
9667 	}
9668 }
9669 
/* Upper bound on a single getdirentries request; larger user buffers are clamped to this. */
#define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
9671 
9672 /*
9673  * Read a block of directory entries in a file system independent format.
9674  */
9675 static int
getdirentries_common(int fd,user_addr_t bufp,user_size_t bufsize,ssize_t * bytesread,off_t * offset,int * eofflag,int flags)9676 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
9677     off_t *offset, int *eofflag, int flags)
9678 {
9679 	vnode_t vp;
9680 	struct vfs_context context = *vfs_context_current();    /* local copy */
9681 	struct fileproc *fp;
9682 	uio_t auio;
9683 	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9684 	off_t loff;
9685 	int error, numdirent;
9686 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
9687 
9688 get_from_fd:
9689 	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
9690 	if (error) {
9691 		return error;
9692 	}
9693 
9694 	vn_offset_lock(fp->fp_glob);
9695 	if (((vnode_t)fp_get_data(fp)) != vp) {
9696 		vn_offset_unlock(fp->fp_glob);
9697 		file_drop(fd);
9698 		goto get_from_fd;
9699 	}
9700 
9701 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9702 		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9703 		error = EBADF;
9704 		goto out;
9705 	}
9706 
9707 	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
9708 		bufsize = GETDIRENTRIES_MAXBUFSIZE;
9709 	}
9710 
9711 #if CONFIG_MACF
9712 	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
9713 	if (error) {
9714 		goto out;
9715 	}
9716 #endif
9717 
9718 	if ((error = vnode_getwithref(vp))) {
9719 		goto out;
9720 	}
9721 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9722 
9723 #if CONFIG_UNION_MOUNTS
9724 unionread:
9725 #endif /* CONFIG_UNION_MOUNTS */
9726 	if (vp->v_type != VDIR) {
9727 		(void)vnode_put(vp);
9728 		error = EINVAL;
9729 		goto out;
9730 	}
9731 
9732 #if CONFIG_MACF
9733 	error = mac_vnode_check_readdir(&context, vp);
9734 	if (error != 0) {
9735 		(void)vnode_put(vp);
9736 		goto out;
9737 	}
9738 #endif /* MAC */
9739 
9740 	loff = fp->fp_glob->fg_offset;
9741 	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9742 	uio_addiov(auio, bufp, bufsize);
9743 
9744 	if (flags & VNODE_READDIR_EXTENDED) {
9745 		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
9746 		fp->fp_glob->fg_offset = uio_offset(auio);
9747 	} else {
9748 		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
9749 		fp->fp_glob->fg_offset = uio_offset(auio);
9750 	}
9751 	if (error) {
9752 		(void)vnode_put(vp);
9753 		goto out;
9754 	}
9755 
9756 #if CONFIG_UNION_MOUNTS
9757 	if ((user_ssize_t)bufsize == uio_resid(auio) &&
9758 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
9759 		vnode_t uvp;
9760 
9761 		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
9762 			if (vnode_ref(uvp) == 0) {
9763 				fp_set_data(fp, uvp);
9764 				fp->fp_glob->fg_offset = 0;
9765 				vnode_rele(vp);
9766 				vnode_put(vp);
9767 				vp = uvp;
9768 				goto unionread;
9769 			} else {
9770 				/* could not get a ref, can't replace in fd */
9771 				vnode_put(uvp);
9772 			}
9773 		}
9774 	}
9775 #endif /* CONFIG_UNION_MOUNTS */
9776 
9777 	vnode_put(vp);
9778 	if (offset) {
9779 		*offset = loff;
9780 	}
9781 
9782 	*bytesread = bufsize - uio_resid(auio);
9783 out:
9784 	vn_offset_unlock(fp->fp_glob);
9785 	file_drop(fd);
9786 	return error;
9787 }
9788 
9789 
9790 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)9791 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9792 {
9793 	off_t offset;
9794 	ssize_t bytesread;
9795 	int error, eofflag;
9796 
9797 	AUDIT_ARG(fd, uap->fd);
9798 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
9799 	    &bytesread, &offset, &eofflag, 0);
9800 
9801 	if (error == 0) {
9802 		if (proc_is64bit(p)) {
9803 			user64_long_t base = (user64_long_t)offset;
9804 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9805 		} else {
9806 			user32_long_t base = (user32_long_t)offset;
9807 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9808 		}
9809 		*retval = (int)bytesread;
9810 	}
9811 	return error;
9812 }
9813 
9814 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)9815 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9816 {
9817 	off_t offset;
9818 	ssize_t bytesread;
9819 	int error, eofflag;
9820 	user_size_t bufsize;
9821 
9822 	AUDIT_ARG(fd, uap->fd);
9823 
9824 	/*
9825 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9826 	 * then the kernel carves out the last 4 bytes to return extended
9827 	 * information to userspace (namely whether we reached EOF with this call).
9828 	 */
9829 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9830 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9831 	} else {
9832 		bufsize = uap->bufsize;
9833 	}
9834 
9835 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
9836 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9837 
9838 	if (error == 0) {
9839 		*retval = bytesread;
9840 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9841 
9842 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9843 			getdirentries64_flags_t flags = 0;
9844 			if (eofflag) {
9845 				flags |= GETDIRENTRIES64_EOF;
9846 			}
9847 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9848 			    sizeof(flags));
9849 		}
9850 	}
9851 	return error;
9852 }
9853 
9854 
9855 /*
9856  * Set the mode mask for creation of filesystem nodes.
9857  * XXX implement xsecurity
9858  */
9859 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
9860 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)9861 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9862 {
9863 	AUDIT_ARG(mask, newmask);
9864 	proc_fdlock(p);
9865 	*retval = p->p_fd.fd_cmask;
9866 	p->p_fd.fd_cmask = newmask & ALLPERMS;
9867 	proc_fdunlock(p);
9868 	return 0;
9869 }
9870 
9871 /*
9872  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9873  *
9874  * Parameters:    p                       Process requesting to set the umask
9875  *                uap                     User argument descriptor (see below)
9876  *                retval                  umask of the process (parameter p)
9877  *
9878  * Indirect:      uap->newmask            umask to set
9879  *                uap->xsecurity          ACL to set
9880  *
9881  * Returns:        0                      Success
9882  *                !0                      Not success
9883  *
9884  */
9885 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)9886 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9887 {
9888 	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
9889 }
9890 
9891 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)9892 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9893 {
9894 	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9895 }
9896 
9897 /*
9898  * Void all references to file by ripping underlying filesystem
9899  * away from vnode.
9900  */
9901 /* ARGSUSED */
9902 int
revoke(proc_t p,struct revoke_args * uap,__unused int32_t * retval)9903 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
9904 {
9905 	vnode_t vp;
9906 	struct vnode_attr va;
9907 	vfs_context_t ctx = vfs_context_current();
9908 	int error;
9909 	struct nameidata nd;
9910 
9911 	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
9912 	    uap->path, ctx);
9913 	error = namei(&nd);
9914 	if (error) {
9915 		return error;
9916 	}
9917 	vp = nd.ni_vp;
9918 
9919 	nameidone(&nd);
9920 
9921 	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
9922 		error = ENOTSUP;
9923 		goto out;
9924 	}
9925 
9926 	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
9927 		error = EBUSY;
9928 		goto out;
9929 	}
9930 
9931 #if CONFIG_MACF
9932 	error = mac_vnode_check_revoke(ctx, vp);
9933 	if (error) {
9934 		goto out;
9935 	}
9936 #endif
9937 
9938 	VATTR_INIT(&va);
9939 	VATTR_WANTED(&va, va_uid);
9940 	if ((error = vnode_getattr(vp, &va, ctx))) {
9941 		goto out;
9942 	}
9943 	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
9944 	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
9945 		goto out;
9946 	}
9947 	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
9948 		VNOP_REVOKE(vp, REVOKEALL, ctx);
9949 	}
9950 out:
9951 	vnode_put(vp);
9952 	return error;
9953 }
9954 
9955 
9956 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
9958  *  The following system calls are designed to support features
9959  *  which are specific to the HFS & HFS Plus volume formats
9960  */
9961 
9962 
9963 /*
9964  * Obtain attribute information on objects in a directory while enumerating
9965  * the directory.
9966  */
9967 /* ARGSUSED */
/*
 * Enumerate a directory open on uap->fd, returning the attributes named in
 * uap->alist for each entry via VNOP_READDIRATTR.  *retval is the eofflag
 * (1 at end of directory), mirroring getdirentries().
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Saved so the full count can be re-requested after a union-layer switch. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-fd offset lock, then re-check that the fd still refers
	 * to the vnode we looked up: a union-mount traversal (below) by a
	 * racing thread may have swapped the fd's data vnode; retry if so.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Report actual count, fs state token, and pre-read offset back to userspace. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10131 
10132 /*
10133  * Exchange data between two files
10134  */
10135 
10136 /* ARGSUSED */
/*
 * Atomically exchange the data forks of two regular files on the same
 * volume via VNOP_EXCHANGE, then swap their cached names and parents in
 * the name cache so path lookups stay consistent, and emit fsevents /
 * kauth fileop notifications as configured.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* Follow symlinks unless the caller asked us not to. */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs both read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Only build the (expensive) paths if someone will consume them. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The data forks were swapped underneath the vnodes, so swap
		 * the cached names and parents too to keep the name cache
		 * consistent with what is on disk.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10287 
10288 /*
10289  * Return (in MB) the amount of freespace on the given vnode's volume.
10290  */
10291 uint32_t freespace_mb(vnode_t vp);
10292 
10293 uint32_t
freespace_mb(vnode_t vp)10294 freespace_mb(vnode_t vp)
10295 {
10296 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10297 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10298 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10299 }
10300 
10301 #if CONFIG_SEARCHFS
10302 
10303 /* ARGSUSED */
10304 
10305 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10306 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10307 {
10308 	vnode_t vp, tvp;
10309 	int i, error = 0;
10310 	int fserror = 0;
10311 	struct nameidata nd;
10312 	struct user64_fssearchblock searchblock;
10313 	struct searchstate *state;
10314 	struct attrlist *returnattrs;
10315 	struct timeval timelimit;
10316 	void *searchparams1, *searchparams2;
10317 	uio_t auio = NULL;
10318 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10319 	uint32_t nummatches;
10320 	size_t mallocsize;
10321 	uint32_t nameiflags;
10322 	vfs_context_t ctx = vfs_context_current();
10323 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10324 
10325 	/* Start by copying in fsearchblock parameter list */
10326 	if (IS_64BIT_PROCESS(p)) {
10327 		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10328 		timelimit.tv_sec = searchblock.timelimit.tv_sec;
10329 		timelimit.tv_usec = searchblock.timelimit.tv_usec;
10330 	} else {
10331 		struct user32_fssearchblock tmp_searchblock;
10332 
10333 		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10334 		// munge into 64-bit version
10335 		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10336 		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10337 		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10338 		searchblock.maxmatches = tmp_searchblock.maxmatches;
10339 		/*
10340 		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10341 		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10342 		 */
10343 		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10344 		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10345 		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10346 		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10347 		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10348 		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10349 		searchblock.searchattrs = tmp_searchblock.searchattrs;
10350 	}
10351 	if (error) {
10352 		return error;
10353 	}
10354 
10355 	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10356 	 */
10357 	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10358 	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10359 		return EINVAL;
10360 	}
10361 
10362 	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10363 	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
10364 	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10365 	/* block.                                                                                             */
10366 	/*												      */
10367 	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
10368 	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
10369 	/*       assumes the size is still 556 bytes it will continue to work				      */
10370 
10371 	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10372 	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10373 
10374 	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10375 
10376 	/* Now set up the various pointers to the correct place in our newly allocated memory */
10377 
10378 	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10379 	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
10380 	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
10381 
10382 	/* Now copy in the stuff given our local variables. */
10383 
10384 	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
10385 		goto freeandexit;
10386 	}
10387 
10388 	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
10389 		goto freeandexit;
10390 	}
10391 
10392 	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
10393 		goto freeandexit;
10394 	}
10395 
10396 	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
10397 		goto freeandexit;
10398 	}
10399 
10400 	/*
10401 	 * When searching a union mount, need to set the
10402 	 * start flag at the first call on each layer to
10403 	 * reset state for the new volume.
10404 	 */
10405 	if (uap->options & SRCHFS_START) {
10406 		state->ss_union_layer = 0;
10407 	} else {
10408 		uap->options |= state->ss_union_flags;
10409 	}
10410 	state->ss_union_flags = 0;
10411 
10412 	/*
10413 	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10414 	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10415 	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10416 	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10417 	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10418 	 */
10419 
10420 	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10421 		attrreference_t* string_ref;
10422 		u_int32_t* start_length;
10423 		user64_size_t param_length;
10424 
10425 		/* validate searchparams1 */
10426 		param_length = searchblock.sizeofsearchparams1;
10427 		/* skip the word that specifies length of the buffer */
10428 		start_length = (u_int32_t*) searchparams1;
10429 		start_length = start_length + 1;
10430 		string_ref = (attrreference_t*) start_length;
10431 
10432 		/* ensure no negative offsets or too big offsets */
10433 		if (string_ref->attr_dataoffset < 0) {
10434 			error = EINVAL;
10435 			goto freeandexit;
10436 		}
10437 		if (string_ref->attr_length > MAXPATHLEN) {
10438 			error = EINVAL;
10439 			goto freeandexit;
10440 		}
10441 
10442 		/* Check for pointer overflow in the string ref */
10443 		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10444 			error = EINVAL;
10445 			goto freeandexit;
10446 		}
10447 
10448 		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10449 			error = EINVAL;
10450 			goto freeandexit;
10451 		}
10452 		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10453 			error = EINVAL;
10454 			goto freeandexit;
10455 		}
10456 	}
10457 
10458 	/* set up the uio structure which will contain the users return buffer */
10459 	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10460 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10461 
10462 	nameiflags = 0;
10463 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10464 		nameiflags |= FOLLOW;
10465 	}
10466 	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10467 	    UIO_USERSPACE, uap->path, ctx);
10468 
10469 	error = namei(&nd);
10470 	if (error) {
10471 		goto freeandexit;
10472 	}
10473 	vp = nd.ni_vp;
10474 	nameidone(&nd);
10475 
10476 	/*
10477 	 * Switch to the root vnode for the volume
10478 	 */
10479 	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10480 	vnode_put(vp);
10481 	if (error) {
10482 		goto freeandexit;
10483 	}
10484 	vp = tvp;
10485 
10486 #if CONFIG_UNION_MOUNTS
10487 	/*
10488 	 * If it's a union mount, the path lookup takes
10489 	 * us to the top layer. But we may need to descend
10490 	 * to a lower layer. For non-union mounts the layer
10491 	 * is always zero.
10492 	 */
10493 	for (i = 0; i < (int) state->ss_union_layer; i++) {
10494 		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10495 			break;
10496 		}
10497 		tvp = vp;
10498 		vp = vp->v_mount->mnt_vnodecovered;
10499 		if (vp == NULL) {
10500 			vnode_put(tvp);
10501 			error = ENOENT;
10502 			goto freeandexit;
10503 		}
10504 		error = vnode_getwithref(vp);
10505 		vnode_put(tvp);
10506 		if (error) {
10507 			goto freeandexit;
10508 		}
10509 	}
10510 #endif /* CONFIG_UNION_MOUNTS */
10511 
10512 #if CONFIG_MACF
10513 	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
10514 	if (error) {
10515 		vnode_put(vp);
10516 		goto freeandexit;
10517 	}
10518 #endif
10519 
10520 
10521 	/*
10522 	 * If searchblock.maxmatches == 0, then skip the search. This has happened
10523 	 * before and sometimes the underlying code doesnt deal with it well.
10524 	 */
10525 	if (searchblock.maxmatches == 0) {
10526 		nummatches = 0;
10527 		goto saveandexit;
10528 	}
10529 
10530 	/*
10531 	 * Allright, we have everything we need, so lets make that call.
10532 	 *
10533 	 * We keep special track of the return value from the file system:
10534 	 * EAGAIN is an acceptable error condition that shouldn't keep us
10535 	 * from copying out any results...
10536 	 */
10537 
10538 	fserror = VNOP_SEARCHFS(vp,
10539 	    searchparams1,
10540 	    searchparams2,
10541 	    &searchblock.searchattrs,
10542 	    (uint32_t)searchblock.maxmatches,
10543 	    &timelimit,
10544 	    returnattrs,
10545 	    &nummatches,
10546 	    (uint32_t)uap->scriptcode,
10547 	    (uint32_t)uap->options,
10548 	    auio,
10549 	    (struct searchstate *) &state->ss_fsstate,
10550 	    ctx);
10551 
10552 #if CONFIG_UNION_MOUNTS
10553 	/*
10554 	 * If it's a union mount we need to be called again
10555 	 * to search the mounted-on filesystem.
10556 	 */
10557 	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10558 		state->ss_union_flags = SRCHFS_START;
10559 		state->ss_union_layer++;        // search next layer down
10560 		fserror = EAGAIN;
10561 	}
10562 #endif /* CONFIG_UNION_MOUNTS */
10563 
10564 saveandexit:
10565 
10566 	vnode_put(vp);
10567 
10568 	/* Now copy out the stuff that needs copying out. That means the number of matches, the
10569 	 *  search state.  Everything was already put into he return buffer by the vop call. */
10570 
10571 	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10572 		goto freeandexit;
10573 	}
10574 
10575 	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10576 		goto freeandexit;
10577 	}
10578 
10579 	error = fserror;
10580 
10581 freeandexit:
10582 
10583 	kfree_data(searchparams1, mallocsize);
10584 
10585 	return error;
10586 } /* end of searchfs system call */
10587 
10588 #else /* CONFIG_SEARCHFS */
10589 
/*
 * searchfs stub used when CONFIG_SEARCHFS is not configured:
 * the system call is present but always reports "not supported".
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10595 
10596 #endif /* CONFIG_SEARCHFS */
10597 
10598 
10599 #if CONFIG_DATALESS_FILES
10600 
10601 /*
10602  * === Namespace Resolver Up-call Mechanism ===
10603  *
10604  * When I/O is performed to a dataless file or directory (read, write,
10605  * lookup-in, etc.), the file system performs an upcall to the namespace
10606  * resolver (filecoordinationd) to materialize the object.
10607  *
10608  * We need multiple up-calls to be in flight at once, and we need these
10609  * up-calls to be interruptible, thus the following implementation:
10610  *
10611  * => The nspace_resolver_request represents the in-kernel request state.
10612  *    It contains a request ID, storage space for the errno code returned
10613  *    by filecoordinationd, and flags.
10614  *
10615  * => The request ID is simply a global monotonically incrementing 32-bit
10616  *    number.  Outstanding requests are stored in a hash table, and the
10617  *    hash function is extremely simple.
10618  *
10619  * => When an upcall is to be made to filecoordinationd, a request structure
10620  *    is allocated on the stack (it is small, and needs to live only during
10621  *    the duration of the call to resolve_nspace_item_ext()).  It is
10622  *    initialized and inserted into the table.  Some backpressure from
10623  *    filecoordinationd is applied by limiting the numnber of entries that
10624  *    can be inserted into the table (and thus limiting the number of
10625  *    outstanding requests issued to filecoordinationd); waiting for an
10626  *    available slot is interruptible.
10627  *
10628  * => Once the request has been inserted into the table, the up-call is made
10629  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
10630  *    immediately and filecoordinationd processes the request asynchronously.
10631  *
10632  * => The caller now waits for the request to complete.  Tnis is achieved by
10633  *    sleeping on the address of the request structure and waiting for
10634  *    filecoordinationd to mark the request structure as complete.  This
10635  *    is an interruptible sleep call; if interrupted, the request structure
10636  *    is removed from the table and EINTR is returned to the caller.  If
10637  *    this occurs, an advisory up-call is made to filecoordinationd with
10638  *    the request ID to indicate that the request can be aborted or
10639  *    de-prioritized at the discretion of filecoordinationd.
10640  *
10641  * => When filecoordinationd has completed the request, it signals completion
10642  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
10643  *    decorated as a namespace resolver can write to this sysctl node.  The
10644  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10645  *    The request ID is looked up in the table, and if the request is found,
10646  *    the error code is stored in the request structure and a wakeup()
10647  *    issued on the address of the request structure.  If the request is not
10648  *    found, we simply drop the completion notification, assuming that the
10649  *    caller was interrupted.
10650  *
10651  * => When the waiting thread wakes up, it extracts the error code from the
10652  *    request structure, removes the request from the table, and returns the
10653  *    error code to the calling function.  Fini!
10654  */
10655 
/*
 * In-kernel state for one outstanding materialization up-call to the
 * namespace resolver (see the big comment block above).  Lives on the
 * requesting thread's stack and is linked into the request hash table
 * for the duration of the up-call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t         r_vp;           /* vnode being materialized */
	uint32_t        r_req_id;       /* ID echoed back by the resolver */
	int             r_resolver_error; /* errno reported by the resolver */
	int             r_flags;        /* RRF_* flags below */
};

/* r_flags: the resolver has completed this request. */
#define RRF_COMPLETE    0x0001
10665 
/*
 * Return the next namespace-resolver request ID.  IDs come from a
 * global, monotonically-incrementing 32-bit counter.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	/* OSAddAtomic returns the pre-increment value. */
	return OSAddAtomic(1, &next_req_id);
}
10673 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (capped at MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True while some thread sleeps waiting for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the hash table, the count, and the wait flag above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
10694 
10695 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)10696 nspace_resolver_req_lookup(uint32_t req_id)
10697 {
10698 	struct nspace_resolver_requesthead *bucket;
10699 	struct nspace_resolver_request *req;
10700 
10701 	bucket = NSPACE_RESOLVER_HASH(req_id);
10702 	LIST_FOREACH(req, bucket, r_hashlink) {
10703 		if (req->r_req_id == req_id) {
10704 			return req;
10705 		}
10706 	}
10707 
10708 	return NULL;
10709 }
10710 
/*
 * Insert a resolver request into the hash table, waiting
 * (interruptibly) for a slot when the maximum number of outstanding
 * requests has been reached.  Caller must hold NSPACE_REQ_LOCK().
 *
 * Returns 0 on success, or an errno if the sleep was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	/*
	 * Backpressure on filecoordinationd: msleep() drops and
	 * re-acquires the hash mutex, so re-check the count on
	 * every wakeup.
	 */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
10737 
10738 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)10739 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10740 {
10741 	struct nspace_resolver_requesthead *bucket;
10742 
10743 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10744 #if DIAGNOSTIC
10745 	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10746 #endif /* DIAGNOSTIC */
10747 	LIST_REMOVE(req, r_hashlink);
10748 	nspace_resolver_request_count--;
10749 
10750 	if (nspace_resolver_request_wait_slot) {
10751 		nspace_resolver_request_wait_slot = false;
10752 		wakeup(&nspace_resolver_request_count);
10753 	}
10754 }
10755 
/*
 * Send an advisory cancellation message for the given request ID to
 * filecoordinationd (e.g. after the waiting thread was interrupted).
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	/* Drop the send right obtained above. */
	ipc_port_release_send(mp);
}
10778 
/*
 * Wait (interruptibly) for the resolver to mark the given request
 * complete.  If the sleep is interrupted, the request is failed with
 * EINTR (or ETIMEDOUT) and an advisory cancel is sent to the resolver.
 *
 * Acquires and releases NSPACE_REQ_LOCK(); always removes the request
 * from the table before returning.  Returns the request's resolver
 * errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: fail the request ourselves. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	if (send_cancel_message) {
		/* Advisory only; see nspace_resolver_req_cancel(). */
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10808 
/*
 * Record the resolver's result in the request and wake the thread
 * sleeping in nspace_resolver_req_wait().  Caller must hold
 * NSPACE_REQ_LOCK().
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10818 
/*
 * Handle a completion notification from the resolver (written via the
 * vfs.nspace.complete sysctl).  Looks up the request by ID; if found,
 * optionally verifies (when orig_gencount != 0) that the vnode's
 * recursive generation count is unchanged since the request was issued,
 * then marks the request complete and wakes the waiter.  Unknown IDs
 * are silently dropped -- the waiter was likely interrupted.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/* Hold off renames on the mount while comparing gencounts. */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
10886 
/* The process currently registered as the namespace resolver, if any. */
static struct proc *nspace_resolver_proc;
10888 
10889 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)10890 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10891 {
10892 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10893 	    p == nspace_resolver_proc) ? 1 : 0;
10894 	return 0;
10895 }
10896 
/*
 * Register (is_resolver != 0) or unregister process p as the namespace
 * resolver.  Requires root plus the PRIV_VFS_DATALESS_RESOLVER
 * privilege; only one resolver may be registered at a time.
 *
 * Returns 0 on success, EPERM/EBUSY, or a priv_check_cred() error.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	error = priv_check_cred(vfs_context_ucred(ctx),
	    PRIV_VFS_DATALESS_RESOLVER, 0);
	if (error) {
		return error;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
10941 
10942 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)10943 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10944 {
10945 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10946 	    (p->p_vfs_iopolicy &
10947 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10948 		*is_prevented = 1;
10949 	} else {
10950 		*is_prevented = 0;
10951 	}
10952 	return 0;
10953 }
10954 
/*
 * Set whether dataless-file materialization is prevented for process p
 * by toggling P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES.  The resolver
 * process must always remain prevented (EBUSY if asked otherwise).
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		/* The resolver itself must never materialize. */
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
10969 
10970 static int
nspace_materialization_get_thread_state(int * is_prevented)10971 nspace_materialization_get_thread_state(int *is_prevented)
10972 {
10973 	uthread_t ut = current_uthread();
10974 
10975 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10976 	return 0;
10977 }
10978 
10979 static int
nspace_materialization_set_thread_state(int is_prevented)10980 nspace_materialization_set_thread_state(int is_prevented)
10981 {
10982 	uthread_t ut = current_uthread();
10983 
10984 	if (is_prevented) {
10985 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10986 	} else {
10987 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10988 	}
10989 	return 0;
10990 }
10991 
10992 /* the vfs.nspace branch */
10993 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10994 
10995 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)10996 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10997     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10998 {
10999 	struct proc *p = req->p;
11000 	int new_value, old_value, changed = 0;
11001 	int error;
11002 
11003 	error = nspace_resolver_get_proc_state(p, &old_value);
11004 	if (error) {
11005 		return error;
11006 	}
11007 
11008 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11009 	    &changed);
11010 	if (error == 0 && changed) {
11011 		error = nspace_resolver_set_proc_state(p, new_value);
11012 	}
11013 	return error;
11014 }
11015 
11016 /* decorate this process as the dataless file resolver */
11017 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11018     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11019     0, 0, sysctl_nspace_resolver, "I", "");
11020 
11021 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11022 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11023     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11024 {
11025 	struct proc *p = req->p;
11026 	int new_value, old_value, changed = 0;
11027 	int error;
11028 
11029 	error = nspace_materialization_get_proc_state(p, &old_value);
11030 	if (error) {
11031 		return error;
11032 	}
11033 
11034 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11035 	    &changed);
11036 	if (error == 0 && changed) {
11037 		error = nspace_materialization_set_proc_state(p, new_value);
11038 	}
11039 	return error;
11040 }
11041 
11042 /* decorate this process as not wanting to materialize dataless files */
11043 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11044     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11045     0, 0, sysctl_nspace_prevent_materialization, "I", "");
11046 
11047 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11048 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11049     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11050 {
11051 	int new_value, old_value, changed = 0;
11052 	int error;
11053 
11054 	error = nspace_materialization_get_thread_state(&old_value);
11055 	if (error) {
11056 		return error;
11057 	}
11058 
11059 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11060 	    &changed);
11061 	if (error == 0 && changed) {
11062 		error = nspace_materialization_set_thread_state(new_value);
11063 	}
11064 	return error;
11065 }
11066 
11067 /* decorate this thread as not wanting to materialize dataless files */
11068 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11069     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11070     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11071 
/*
 * sysctl handler for vfs.nspace.complete: the resolver reports request
 * completions here.  The written payload is a { req_id, errno } pair of
 * uint32_t's, optionally followed by a uint64_t generation count taken
 * when the request was issued.  Only the registered resolver may write
 * here (EPERM otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* First read consumes the mandatory { req_id, errno } pair. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11121 
11122 #endif /* CONFIG_DATALESS_FILES */
11123 
11124 #if CONFIG_DATALESS_FILES
11125 #define __no_dataless_unused    /* nothing */
11126 #else
11127 #define __no_dataless_unused    __unused
11128 #endif
11129 
/*
 * Decide whether the given vfs context may materialize a dataless
 * file or directory.
 *
 * Returns:
 *   0           materialization may proceed
 *   EDEADLK     materialization is prevented (kernel context, thread
 *               or process decorated as no-materialize, or the
 *               default policy)
 *   EJUSTRETURN the process holds the dataless-manipulation
 *               entitlement; proceed as if the object were not
 *               dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11186 
/*
 * One-time initialization of the namespace-resolver request hash
 * table; called during VFS startup.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11196 
/*
 * Called when process p exits (and when it voluntarily unregisters via
 * the resolver sysctl).  If p is the registered resolver, fail every
 * outstanding request with ETIMEDOUT and clear the registration so a
 * new resolver can attach.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Wake every waiter; the resolver can no longer answer. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11222 
/*
 * Materialize a dataless vnode for operation op with no extra
 * arguments; thin wrapper around resolve_nspace_item_ext().
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11228 
/* Entitlements that mark a process as a dataless resolver/manipulator. */
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"

/*
 * Return TRUE if the vfs context is associated with a process entitled
 * for dataless manipulation.
 *
 * XXX Arguably belongs in vfs_subr.c, but is here because of the
 * complication around CONFIG_DATALESS_FILES.
 */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
{
#if CONFIG_DATALESS_FILES
	/* The entitlement checks below inspect the current task. */
	assert(ctx->vc_thread == current_thread());
	return IOCurrentTaskHasEntitlement( DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOCurrentTaskHasEntitlement(DATALESS_RESOLVER_ENTITLEMENT);
#else
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
11252 
#if CONFIG_DATALESS_FILES
/*
 * Log (debug level) that the current process was prevented from
 * materializing vp for operation op.  DEVELOPMENT kernels also log
 * the vnode's path.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(&p_name[0], sizeof(p_name));

	/* Human-readable vnode type for the log message. */
	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	char *path = NULL;
	int   len;

	path = get_pathbuff();
	len = MAXPATHLEN;
	if (path) {
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
11296 
11297 
11298 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11299 vfs_materialize_item(
11300 	struct vnode *vp __no_dataless_unused,
11301 	uint64_t op __no_dataless_unused,
11302 	int64_t offset __no_dataless_unused,
11303 	int64_t size __no_dataless_unused,
11304 	char *lookup_name __no_dataless_unused,
11305 	size_t const namelen __no_dataless_unused)
11306 {
11307 #if CONFIG_DATALESS_FILES
11308 	struct nspace_resolver_request req;
11309 	kern_return_t kern_ret;
11310 	mach_port_t mach_port;
11311 	char *path = NULL;
11312 	vfs_context_t context;
11313 	int path_len;
11314 	int error;
11315 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11316 	audit_token_t atoken;
11317 #endif
11318 
11319 	/*
11320 	 * If this is a snapshot event and the vnode is on a disk image just
11321 	 * pretend nothing happened since any change to the disk image will
11322 	 * cause the disk image itself to get backed up and this avoids multi-
11323 	 * way deadlocks between the snapshot handler and the ever popular
11324 	 * diskimages-helper process. The variable nspace_allow_virtual_devs
11325 	 * allows this behavior to be overridden (for use by the Mobile
11326 	 * TimeMachine testing infrastructure which uses disk images).
11327 	 */
11328 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11329 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11330 		return ENOTSUP;
11331 	}
11332 
11333 	context = vfs_context_current();
11334 
11335 	error = vfs_context_dataless_materialization_is_prevented(context);
11336 	if (error) {
11337 		log_materialization_prevented(vp, op);
11338 		return error;
11339 	}
11340 
11341 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11342 	    &mach_port);
11343 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11344 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11345 		/*
11346 		 * Treat this like being unable to access the backing store
11347 		 * server.
11348 		 */
11349 		return ETIMEDOUT;
11350 	}
11351 
11352 	path = zalloc(ZV_NAMEI);
11353 	path_len = MAXPATHLEN;
11354 
11355 	error = vn_getpath(vp, path, &path_len);
11356 	if (error) {
11357 		goto out_release_port;
11358 	}
11359 
11360 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11361 	error = vfs_context_copy_audit_token(context, &atoken);
11362 	if (error) {
11363 		goto out_release_port;
11364 	}
11365 #endif
11366 
11367 	req.r_req_id = next_nspace_req_id();
11368 	req.r_resolver_error = 0;
11369 	req.r_flags = 0;
11370 	req.r_vp = vp;
11371 
11372 	NSPACE_REQ_LOCK();
11373 	error = nspace_resolver_req_add(&req);
11374 	NSPACE_REQ_UNLOCK();
11375 	if (error) {
11376 		goto out_release_port;
11377 	}
11378 
11379 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
11380 	if (vp->v_type == VDIR) {
11381 		char *tmpname = NULL;
11382 
11383 		/*
11384 		 * If the caller provided a lookup_name *and* a name length,
11385 		 * then we assume the lookup_name is not NUL-terminated.
11386 		 * Allocate a temporary buffer in this case to provide
11387 		 * a NUL-terminated path name to the IPC call.
11388 		 */
11389 		if (lookup_name != NULL && namelen != 0) {
11390 			if (namelen >= PATH_MAX) {
11391 				error = EINVAL;
11392 				goto out_release_port;
11393 			}
11394 			tmpname = zalloc(ZV_NAMEI);
11395 			strlcpy(tmpname, lookup_name, namelen + 1);
11396 			lookup_name = tmpname;
11397 		} else if (lookup_name != NULL) {
11398 			/*
11399 			 * If the caller provided a lookup_name with a
11400 			 * zero name length, then we assume it's NUL-
11401 			 * terminated.  Verify it has a valid length.
11402 			 */
11403 			if (strlen(lookup_name) >= PATH_MAX) {
11404 				error = EINVAL;
11405 				goto out_release_port;
11406 			}
11407 		}
11408 
11409 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11410 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
11411 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
11412 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
11413 #else
11414 		kern_ret = send_vfs_resolve_dir(mach_port, req.r_req_id,
11415 		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11416 		    lookup_name == NULL ? "" : lookup_name, path);
11417 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11418 
11419 		if (tmpname != NULL) {
11420 			zfree(ZV_NAMEI, tmpname);
11421 
11422 			/*
11423 			 * Poison lookup_name rather than reference
11424 			 * freed memory.
11425 			 */
11426 			lookup_name = NULL;
11427 		}
11428 	} else {
11429 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11430 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
11431 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
11432 		    offset, size, path, atoken);
11433 #else
11434 		kern_ret = send_vfs_resolve_file(mach_port, req.r_req_id,
11435 		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11436 		    offset, size, path);
11437 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11438 	}
11439 	if (kern_ret != KERN_SUCCESS) {
11440 		/*
11441 		 * Also treat this like being unable to access the backing
11442 		 * store server.
11443 		 */
11444 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
11445 		    kern_ret);
11446 		error = ETIMEDOUT;
11447 
11448 		NSPACE_REQ_LOCK();
11449 		nspace_resolver_req_remove(&req);
11450 		NSPACE_REQ_UNLOCK();
11451 		goto out_release_port;
11452 	}
11453 
11454 	/*
11455 	 * Give back the memory we allocated earlier while we wait; we
11456 	 * no longer need it.
11457 	 */
11458 	zfree(ZV_NAMEI, path);
11459 	path = NULL;
11460 
11461 	/*
11462 	 * Request has been submitted to the resolver. Now (interruptibly)
11463 	 * wait for completion. Upon requrn, the request will have been
11464 	 * removed from the lookup table.
11465 	 */
11466 	error = nspace_resolver_req_wait(&req);
11467 
11468 out_release_port:
11469 	if (path != NULL) {
11470 		zfree(ZV_NAMEI, path);
11471 	}
11472 	ipc_port_release_send(mach_port);
11473 
11474 	return error;
11475 #else
11476 	return ENOTSUP;
11477 #endif /* CONFIG_DATALESS_FILES */
11478 }
11479 
11480 /*
11481  * vfs_materialize_file: Materialize a regular file.
11482  *
11483  * Inputs:
11484  * vp		The dataless file to be materialized.
11485  *
11486  * op		What kind of operation is being performed:
11487  *		-> NAMESPACE_HANDLER_READ_OP
11488  *		-> NAMESPACE_HANDLER_WRITE_OP
11489  *		-> NAMESPACE_HANDLER_LINK_CREATE
11490  *		-> NAMESPACE_HANDLER_DELETE_OP
11491  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
11492  *		-> NAMESPACE_HANDLER_RENAME_OP
11493  *
11494  * offset	offset of I/O for READ or WRITE.  Ignored for
11495  *		other ops.
11496  *
11497  * size		size of I/O for READ or WRITE  Ignored for
11498  *		other ops.
11499  *
11500  * If offsize or size are -1 for a READ or WRITE, then the resolver should
11501  * consider the range to be unknown.
11502  *
11503  * Upon successful return, the caller may proceed with the operation.
11504  * N.B. the file may still be "dataless" in this case.
11505  */
11506 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)11507 vfs_materialize_file(
11508 	struct vnode *vp,
11509 	uint64_t op,
11510 	int64_t offset,
11511 	int64_t size)
11512 {
11513 	if (vp->v_type != VREG) {
11514 		return EFTYPE;
11515 	}
11516 	return vfs_materialize_item(vp, op, offset, size, NULL, 0);
11517 }
11518 
11519 /*
11520  * vfs_materialize_dir:
11521  *
11522  * Inputs:
11523  * vp		The dataless directory to be materialized.
11524  *
11525  * op		What kind of operation is being performed:
11526  *		-> NAMESPACE_HANDLER_READ_OP
11527  *		-> NAMESPACE_HANDLER_WRITE_OP
11528  *		-> NAMESPACE_HANDLER_DELETE_OP
11529  *		-> NAMESPACE_HANDLER_RENAME_OP
11530  *		-> NAMESPACE_HANDLER_LOOKUP_OP
11531  *
11532  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
11533  *		other ops.  May or may not be NUL-terminated; see below.
11534  *
11535  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
11536  *		terminated and namelen is the number of valid bytes in
11537  *		lookup_name. If zero, then lookup_name is assumed to be
11538  *		NUL-terminated.
11539  *
11540  * Upon successful return, the caller may proceed with the operation.
11541  * N.B. the directory may still be "dataless" in this case.
11542  */
11543 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)11544 vfs_materialize_dir(
11545 	struct vnode *vp,
11546 	uint64_t op,
11547 	char *lookup_name,
11548 	size_t namelen)
11549 {
11550 	if (vp->v_type != VDIR) {
11551 		return EFTYPE;
11552 	}
11553 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
11554 		return EINVAL;
11555 	}
11556 	return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
11557 }
11558 
/*
 * Legacy dataless-file resolution path: ask filecoordinationd to
 * materialize the item backing "vp" and (interruptibly) wait for the
 * resolver to complete the request.  "op" is a NAMESPACE_HANDLER_*
 * operation code; "arg" is currently unused.
 *
 * Returns 0 when the caller may proceed, EFTYPE for unsupported vnode
 * types, ENOTSUP for snapshot events, ETIMEDOUT when the resolver port
 * is unavailable or the IPC fails, or another errno.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	// req lives on this stack frame; it must be removed from the
	// resolver table on every path before this function returns.
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11676 
11677 int
nspace_snapshot_event(__unused vnode_t vp,__unused time_t ctime,__unused uint64_t op_type,__unused void * arg)11678 nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
11679     __unused uint64_t op_type, __unused void *arg)
11680 {
11681 	return 0;
11682 }
11683 
#if 0
/*
 * build_volfs_path: (currently compiled out) format a volfs-style
 * path "/.vol/<fsid>/<fileid>" for "vp" into "path".  On entry *len
 * is the buffer size; on return it is the formatted length including
 * the NUL.  Returns 0 on success, -1 if vnode_getattr() failed (in
 * which case a sentinel non-existent path is written instead).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
11706 
11707 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)11708 fsctl_bogus_command_compat(unsigned long cmd)
11709 {
11710 	switch (cmd) {
11711 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
11712 		return FSIOC_SYNC_VOLUME;
11713 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
11714 		return FSIOC_ROUTEFS_SETROUTEID;
11715 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
11716 		return FSIOC_SET_PACKAGE_EXTS;
11717 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
11718 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
11719 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
11720 		return DISK_CONDITIONER_IOC_GET;
11721 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
11722 		return DISK_CONDITIONER_IOC_SET;
11723 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
11724 		return FSIOC_FIOSEEKHOLE;
11725 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
11726 		return FSIOC_FIOSEEKDATA;
11727 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
11728 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
11729 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11730 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
11731 	}
11732 
11733 	return cmd;
11734 }
11735 
/*
 * Setattr callback handed to chflags0() by handle_flags(): performs
 * the compare-and-swap of the BSD flags by forwarding the caller's
 * struct fsioc_cas_bsdflags ("arg") to the filesystem via the
 * FSIOC_CAS_BSDFLAGS ioctl.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11741 
11742 static int __attribute__((noinline))
handle_sync_volume(vnode_t vp,vnode_t * arg_vp,caddr_t data,vfs_context_t ctx)11743 handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
11744 {
11745 	struct vfs_attr vfa;
11746 	mount_t mp = vp->v_mount;
11747 	unsigned arg;
11748 	int error;
11749 
11750 	/* record vid of vp so we can drop it below. */
11751 	uint32_t vvid = vp->v_id;
11752 
11753 	/*
11754 	 * Then grab mount_iterref so that we can release the vnode.
11755 	 * Without this, a thread may call vnode_iterate_prepare then
11756 	 * get into a deadlock because we've never released the root vp
11757 	 */
11758 	error = mount_iterref(mp, 0);
11759 	if (error) {
11760 		return error;
11761 	}
11762 	vnode_put(vp);
11763 
11764 	arg = MNT_NOWAIT;
11765 	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
11766 		arg = MNT_WAIT;
11767 	}
11768 
11769 	/*
11770 	 * If the filessytem supports multiple filesytems in a
11771 	 * partition (For eg APFS volumes in a container, it knows
11772 	 * that the waitfor argument to VFS_SYNC are flags.
11773 	 */
11774 	VFSATTR_INIT(&vfa);
11775 	VFSATTR_WANTED(&vfa, f_capabilities);
11776 	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
11777 	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
11778 	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
11779 	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
11780 		arg |= MNT_VOLUME;
11781 	}
11782 
11783 	/* issue the sync for this volume */
11784 	(void)sync_callback(mp, &arg);
11785 
11786 	/*
11787 	 * Then release the mount_iterref once we're done syncing; it's not
11788 	 * needed for the VNOP_IOCTL below
11789 	 */
11790 	mount_iterdrop(mp);
11791 
11792 	if (arg & FSCTL_SYNC_FULLSYNC) {
11793 		/* re-obtain vnode iocount on the root vp, if possible */
11794 		error = vnode_getwithvid(vp, vvid);
11795 		if (error == 0) {
11796 			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
11797 			vnode_put(vp);
11798 		}
11799 	}
11800 	/* mark the argument VP as having been released */
11801 	*arg_vp = NULL;
11802 	return error;
11803 }
11804 
11805 #if ROUTEFS
11806 static int __attribute__((noinline))
handle_routes(user_addr_t udata)11807 handle_routes(user_addr_t udata)
11808 {
11809 	char routepath[MAXPATHLEN];
11810 	size_t len = 0;
11811 	int error;
11812 
11813 	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11814 		return error;
11815 	}
11816 	bzero(routepath, MAXPATHLEN);
11817 	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11818 	if (error) {
11819 		return error;
11820 	}
11821 	error = routefs_kernel_mount(routepath);
11822 	return error;
11823 }
11824 #endif
11825 
11826 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)11827 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
11828 {
11829 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11830 	struct vnode_attr va;
11831 	int error;
11832 
11833 	VATTR_INIT(&va);
11834 	VATTR_SET(&va, va_flags, cas->new_flags);
11835 
11836 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11837 	return error;
11838 }
11839 
11840 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)11841 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
11842 {
11843 	struct mount *mp = NULL;
11844 	errno_t rootauth = 0;
11845 
11846 	mp = vp->v_mount;
11847 
11848 	/*
11849 	 * query the underlying FS and see if it reports something
11850 	 * sane for this vnode. If volume is authenticated via
11851 	 * chunklist, leave that for the caller to determine.
11852 	 */
11853 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
11854 
11855 	return rootauth;
11856 }
11857 
11858 /*
11859  * Make a filesystem-specific control call:
11860  */
11861 /* ARGSUSED */
/*
 * Common implementation of fsctl()/ffsctl(): marshal the ioctl-style
 * argument into a kernel buffer (stack for small sizes, heap above
 * STK_PARAMS), dispatch generic FSIOC_* commands, and fall through to
 * the filesystem via VNOP_IOCTL for everything else.
 *
 * On return *arg_vp may have been set to NULL if the handler dropped
 * the vnode's iocount (see handle_sync_volume()); callers must check.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* Device special files take the ioctl path, not fsctl. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Use the stack buffer when the argument fits; heap otherwise. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/*
	 * Marshal the argument: copy in for IOC_IN commands; for
	 * zero-size IOC_IN and for IOC_VOID, stash the raw user word
	 * itself in the buffer.
	 */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	/*
	 * NOTE: handle_sync_volume() drops the caller's iocount and may
	 * clear *arg_vp; callers must re-check vp afterwards.
	 */
	case FSIOC_SYNC_VOLUME:
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	/* NOTE(review): with ROUTEFS disabled this silently succeeds (error stays 0) — confirm intended. */
	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* An empty override string clears the override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	/* Report EBUSY unless our caller holds the only non-stream use. */
	case FSIOC_FD_ONLY_OPEN_ONCE: {
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12099 
12100 /* ARGSUSED */
12101 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12102 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12103 {
12104 	int error;
12105 	struct nameidata nd;
12106 	uint32_t nameiflags;
12107 	vnode_t vp = NULL;
12108 	vfs_context_t ctx = vfs_context_current();
12109 
12110 	AUDIT_ARG(cmd, (int)uap->cmd);
12111 	AUDIT_ARG(value32, uap->options);
12112 	/* Get the vnode for the file we are getting info on:  */
12113 	nameiflags = 0;
12114 	//
12115 	// if we come through fsctl() then the file is by definition not open.
12116 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12117 	// lest the caller mistakenly thinks the only open is their own (but in
12118 	// reality it's someone elses).
12119 	//
12120 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12121 		return EINVAL;
12122 	}
12123 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12124 		nameiflags |= FOLLOW;
12125 	}
12126 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12127 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12128 	}
12129 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12130 	    UIO_USERSPACE, uap->path, ctx);
12131 	if ((error = namei(&nd))) {
12132 		goto done;
12133 	}
12134 	vp = nd.ni_vp;
12135 	nameidone(&nd);
12136 
12137 #if CONFIG_MACF
12138 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12139 	if (error) {
12140 		goto done;
12141 	}
12142 #endif
12143 
12144 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12145 
12146 done:
12147 	if (vp) {
12148 		vnode_put(vp);
12149 	}
12150 	return error;
12151 }
12152 /* ARGSUSED */
12153 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12154 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12155 {
12156 	int error;
12157 	vnode_t vp = NULL;
12158 	vfs_context_t ctx = vfs_context_current();
12159 	int fd = -1;
12160 
12161 	AUDIT_ARG(fd, uap->fd);
12162 	AUDIT_ARG(cmd, (int)uap->cmd);
12163 	AUDIT_ARG(value32, uap->options);
12164 
12165 	/* Get the vnode for the file we are getting info on:  */
12166 	if ((error = file_vnode(uap->fd, &vp))) {
12167 		return error;
12168 	}
12169 	fd = uap->fd;
12170 	if ((error = vnode_getwithref(vp))) {
12171 		file_drop(fd);
12172 		return error;
12173 	}
12174 
12175 #if CONFIG_MACF
12176 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12177 		file_drop(fd);
12178 		vnode_put(vp);
12179 		return error;
12180 	}
12181 #endif
12182 
12183 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12184 
12185 	file_drop(fd);
12186 
12187 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12188 	if (vp) {
12189 		vnode_put(vp);
12190 	}
12191 
12192 	return error;
12193 }
12194 /* end of fsctl system call */
12195 
12196 #define FILESEC_ACCESS_ENTITLEMENT              \
12197 	"com.apple.private.vfs.filesec-access"
12198 
12199 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12200 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12201 {
12202 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12203 		/*
12204 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12205 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12206 		 */
12207 		if ((!setting && vfs_context_issuser(ctx)) ||
12208 		    IOCurrentTaskHasEntitlement(FILESEC_ACCESS_ENTITLEMENT)) {
12209 			return 0;
12210 		}
12211 	}
12212 
12213 	return EPERM;
12214 }
12215 
12216 /*
12217  *  Retrieve the data of an extended attribute.
12218  */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* These options only make sense for kernel-internal callers. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With a NULL uio this is a size-only query (attrsize gets set). */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* Return bytes transferred, or the attribute size for size-only queries. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12301 
12302 /*
12303  * Retrieve the data of an extended attribute.
12304  */
12305 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12306 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12307 {
12308 	vnode_t vp;
12309 	char attrname[XATTR_MAXNAMELEN + 1];
12310 	vfs_context_t ctx = vfs_context_current();
12311 	uio_t auio = NULL;
12312 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12313 	size_t attrsize = 0;
12314 	size_t namelen;
12315 	int error;
12316 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12317 
12318 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12319 		return EINVAL;
12320 	}
12321 
12322 	if ((error = file_vnode(uap->fd, &vp))) {
12323 		return error;
12324 	}
12325 	if ((error = vnode_getwithref(vp))) {
12326 		file_drop(uap->fd);
12327 		return error;
12328 	}
12329 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12330 	if (error != 0) {
12331 		goto out;
12332 	}
12333 	if (xattr_protected(attrname) &&
12334 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12335 		goto out;
12336 	}
12337 	if (uap->value && uap->size > 0) {
12338 		if (uap->size > (size_t)XATTR_MAXSIZE) {
12339 			uap->size = XATTR_MAXSIZE;
12340 		}
12341 
12342 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12343 		    &uio_buf[0], sizeof(uio_buf));
12344 		uio_addiov(auio, uap->value, uap->size);
12345 	}
12346 
12347 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
12348 out:
12349 	(void)vnode_put(vp);
12350 	file_drop(uap->fd);
12351 
12352 	if (auio) {
12353 		*retval = uap->size - uio_resid(auio);
12354 	} else {
12355 		*retval = (user_ssize_t)attrsize;
12356 	}
12357 	return error;
12358 }
12359 
/*
 * Scratch state for setxattr(); allocated with kalloc_data() rather than
 * on the kernel stack since nameidata plus the uio buffer are large.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
12366 
12367 /*
12368  * Set the data of an extended attribute.
12369  */
12370 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)12371 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
12372 {
12373 	vnode_t vp;
12374 	vfs_context_t ctx = vfs_context_current();
12375 	uio_t auio = NULL;
12376 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12377 	size_t namelen;
12378 	u_int32_t nameiflags;
12379 	int error;
12380 	struct setxattr_ctx *sactx;
12381 
12382 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12383 		return EINVAL;
12384 	}
12385 
12386 	sactx = (struct setxattr_ctx *)kalloc_data(sizeof(struct setxattr_ctx), Z_WAITOK);
12387 	if (sactx == NULL) {
12388 		return ENOMEM;
12389 	}
12390 
12391 	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
12392 	if (error != 0) {
12393 		if (error == EPERM) {
12394 			/* if the string won't fit in attrname, copyinstr emits EPERM */
12395 			error = ENAMETOOLONG;
12396 		}
12397 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12398 		goto out;
12399 	}
12400 	if (xattr_protected(sactx->attrname) &&
12401 	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
12402 		goto out;
12403 	}
12404 	if (uap->size != 0 && uap->value == 0) {
12405 		error = EINVAL;
12406 		goto out;
12407 	}
12408 	if (uap->size > INT_MAX) {
12409 		error = E2BIG;
12410 		goto out;
12411 	}
12412 
12413 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12414 	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
12415 	if ((error = namei(&sactx->nd))) {
12416 		goto out;
12417 	}
12418 	vp = sactx->nd.ni_vp;
12419 	nameidone(&sactx->nd);
12420 
12421 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12422 	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
12423 	uio_addiov(auio, uap->value, uap->size);
12424 
12425 	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
12426 #if CONFIG_FSE
12427 	if (error == 0) {
12428 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
12429 		    FSE_ARG_VNODE, vp,
12430 		    FSE_ARG_DONE);
12431 	}
12432 #endif
12433 	vnode_put(vp);
12434 out:
12435 	kfree_data(sactx, sizeof(struct setxattr_ctx));
12436 	*retval = 0;
12437 	return error;
12438 }
12439 
12440 /*
12441  * Set the data of an extended attribute.
12442  */
12443 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)12444 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
12445 {
12446 	vnode_t vp;
12447 	char attrname[XATTR_MAXNAMELEN + 1];
12448 	vfs_context_t ctx = vfs_context_current();
12449 	uio_t auio = NULL;
12450 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12451 	size_t namelen;
12452 	int error;
12453 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12454 
12455 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12456 		return EINVAL;
12457 	}
12458 
12459 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12460 	if (error != 0) {
12461 		if (error == EPERM) {
12462 			/* if the string won't fit in attrname, copyinstr emits EPERM */
12463 			return ENAMETOOLONG;
12464 		}
12465 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12466 		return error;
12467 	}
12468 	if (xattr_protected(attrname) &&
12469 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
12470 		return error;
12471 	}
12472 	if (uap->size != 0 && uap->value == 0) {
12473 		return EINVAL;
12474 	}
12475 	if (uap->size > INT_MAX) {
12476 		return E2BIG;
12477 	}
12478 	if ((error = file_vnode(uap->fd, &vp))) {
12479 		return error;
12480 	}
12481 	if ((error = vnode_getwithref(vp))) {
12482 		file_drop(uap->fd);
12483 		return error;
12484 	}
12485 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12486 	    &uio_buf[0], sizeof(uio_buf));
12487 	uio_addiov(auio, uap->value, uap->size);
12488 
12489 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
12490 #if CONFIG_FSE
12491 	if (error == 0) {
12492 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
12493 		    FSE_ARG_VNODE, vp,
12494 		    FSE_ARG_DONE);
12495 	}
12496 #endif
12497 	vnode_put(vp);
12498 	file_drop(uap->fd);
12499 	*retval = 0;
12500 	return error;
12501 }
12502 
12503 /*
12504  * Remove an extended attribute.
12505  * XXX Code duplication here.
12506  */
12507 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)12508 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
12509 {
12510 	vnode_t vp;
12511 	struct nameidata nd;
12512 	char attrname[XATTR_MAXNAMELEN + 1];
12513 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12514 	vfs_context_t ctx = vfs_context_current();
12515 	size_t namelen;
12516 	u_int32_t nameiflags;
12517 	int error;
12518 
12519 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12520 		return EINVAL;
12521 	}
12522 
12523 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12524 	if (error != 0) {
12525 		return error;
12526 	}
12527 	if (xattr_protected(attrname)) {
12528 		return EPERM;
12529 	}
12530 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12531 	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
12532 	if ((error = namei(&nd))) {
12533 		return error;
12534 	}
12535 	vp = nd.ni_vp;
12536 	nameidone(&nd);
12537 
12538 	error = vn_removexattr(vp, attrname, uap->options, ctx);
12539 #if CONFIG_FSE
12540 	if (error == 0) {
12541 		add_fsevent(FSE_XATTR_REMOVED, ctx,
12542 		    FSE_ARG_VNODE, vp,
12543 		    FSE_ARG_DONE);
12544 	}
12545 #endif
12546 	vnode_put(vp);
12547 	*retval = 0;
12548 	return error;
12549 }
12550 
12551 /*
12552  * Remove an extended attribute.
12553  * XXX Code duplication here.
12554  */
12555 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)12556 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
12557 {
12558 	vnode_t vp;
12559 	char attrname[XATTR_MAXNAMELEN + 1];
12560 	size_t namelen;
12561 	int error;
12562 #if CONFIG_FSE
12563 	vfs_context_t ctx = vfs_context_current();
12564 #endif
12565 
12566 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12567 		return EINVAL;
12568 	}
12569 
12570 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12571 	if (error != 0) {
12572 		return error;
12573 	}
12574 	if (xattr_protected(attrname)) {
12575 		return EPERM;
12576 	}
12577 	if ((error = file_vnode(uap->fd, &vp))) {
12578 		return error;
12579 	}
12580 	if ((error = vnode_getwithref(vp))) {
12581 		file_drop(uap->fd);
12582 		return error;
12583 	}
12584 
12585 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
12586 #if CONFIG_FSE
12587 	if (error == 0) {
12588 		add_fsevent(FSE_XATTR_REMOVED, ctx,
12589 		    FSE_ARG_VNODE, vp,
12590 		    FSE_ARG_DONE);
12591 	}
12592 #endif
12593 	vnode_put(vp);
12594 	file_drop(uap->fd);
12595 	*retval = 0;
12596 	return error;
12597 }
12598 
12599 /*
12600  * Retrieve the list of extended attribute names.
12601  * XXX Code duplication here.
12602  */
12603 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)12604 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
12605 {
12606 	vnode_t vp;
12607 	struct nameidata nd;
12608 	vfs_context_t ctx = vfs_context_current();
12609 	uio_t auio = NULL;
12610 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12611 	size_t attrsize = 0;
12612 	u_int32_t nameiflags;
12613 	int error;
12614 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12615 
12616 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12617 		return EINVAL;
12618 	}
12619 
12620 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12621 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
12622 	if ((error = namei(&nd))) {
12623 		return error;
12624 	}
12625 	vp = nd.ni_vp;
12626 	nameidone(&nd);
12627 	if (uap->namebuf != 0 && uap->bufsize > 0) {
12628 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
12629 		    &uio_buf[0], sizeof(uio_buf));
12630 		uio_addiov(auio, uap->namebuf, uap->bufsize);
12631 	}
12632 
12633 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
12634 
12635 	vnode_put(vp);
12636 	if (auio) {
12637 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12638 	} else {
12639 		*retval = (user_ssize_t)attrsize;
12640 	}
12641 	return error;
12642 }
12643 
12644 /*
12645  * Retrieve the list of extended attribute names.
12646  * XXX Code duplication here.
12647  */
12648 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)12649 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
12650 {
12651 	vnode_t vp;
12652 	uio_t auio = NULL;
12653 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12654 	size_t attrsize = 0;
12655 	int error;
12656 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12657 
12658 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12659 		return EINVAL;
12660 	}
12661 
12662 	if ((error = file_vnode(uap->fd, &vp))) {
12663 		return error;
12664 	}
12665 	if ((error = vnode_getwithref(vp))) {
12666 		file_drop(uap->fd);
12667 		return error;
12668 	}
12669 	if (uap->namebuf != 0 && uap->bufsize > 0) {
12670 		auio = uio_createwithbuffer(1, 0, spacetype,
12671 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
12672 		uio_addiov(auio, uap->namebuf, uap->bufsize);
12673 	}
12674 
12675 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
12676 
12677 	vnode_put(vp);
12678 	file_drop(uap->fd);
12679 	if (auio) {
12680 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12681 	} else {
12682 		*retval = (user_ssize_t)attrsize;
12683 	}
12684 	return error;
12685 }
12686 
12687 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)12688 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
12689     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
12690 {
12691 	int error;
12692 	struct mount *mp = NULL;
12693 	vnode_t vp;
12694 	int length;
12695 	int bpflags;
12696 	/* maximum number of times to retry build_path */
12697 	unsigned int retries = 0x10;
12698 
12699 	if (bufsize > PAGE_SIZE) {
12700 		return EINVAL;
12701 	}
12702 
12703 	if (buf == NULL) {
12704 		return ENOMEM;
12705 	}
12706 
12707 retry:
12708 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
12709 		error = ENOTSUP;  /* unexpected failure */
12710 		return ENOTSUP;
12711 	}
12712 
12713 #if CONFIG_UNION_MOUNTS
12714 unionget:
12715 #endif /* CONFIG_UNION_MOUNTS */
12716 	if (objid == 2) {
12717 		struct vfs_attr vfsattr;
12718 		int use_vfs_root = TRUE;
12719 
12720 		VFSATTR_INIT(&vfsattr);
12721 		VFSATTR_WANTED(&vfsattr, f_capabilities);
12722 		if (!(options & FSOPT_ISREALFSID) &&
12723 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
12724 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
12725 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
12726 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
12727 				use_vfs_root = FALSE;
12728 			}
12729 		}
12730 
12731 		if (use_vfs_root) {
12732 			error = VFS_ROOT(mp, &vp, ctx);
12733 		} else {
12734 			error = VFS_VGET(mp, objid, &vp, ctx);
12735 		}
12736 	} else {
12737 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
12738 	}
12739 
12740 #if CONFIG_UNION_MOUNTS
12741 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
12742 		/*
12743 		 * If the fileid isn't found and we're in a union
12744 		 * mount volume, then see if the fileid is in the
12745 		 * mounted-on volume.
12746 		 */
12747 		struct mount *tmp = mp;
12748 		mp = vnode_mount(tmp->mnt_vnodecovered);
12749 		vfs_unbusy(tmp);
12750 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
12751 			goto unionget;
12752 		}
12753 	} else {
12754 		vfs_unbusy(mp);
12755 	}
12756 #else
12757 	vfs_unbusy(mp);
12758 #endif /* CONFIG_UNION_MOUNTS */
12759 
12760 	if (error) {
12761 		return error;
12762 	}
12763 
12764 #if CONFIG_MACF
12765 	error = mac_vnode_check_fsgetpath(ctx, vp);
12766 	if (error) {
12767 		vnode_put(vp);
12768 		return error;
12769 	}
12770 #endif
12771 
12772 	/* Obtain the absolute path to this vnode. */
12773 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
12774 	if (options & FSOPT_NOFIRMLINKPATH) {
12775 		bpflags |= BUILDPATH_NO_FIRMLINK;
12776 	}
12777 	bpflags |= BUILDPATH_CHECK_MOVED;
12778 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
12779 	vnode_put(vp);
12780 
12781 	if (error) {
12782 		/* there was a race building the path, try a few more times */
12783 		if (error == EAGAIN) {
12784 			--retries;
12785 			if (retries > 0) {
12786 				goto retry;
12787 			}
12788 
12789 			error = ENOENT;
12790 		}
12791 		goto out;
12792 	}
12793 
12794 	AUDIT_ARG(text, buf);
12795 
12796 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
12797 		unsigned long path_words[NUMPARMS];
12798 		size_t path_len = sizeof(path_words);
12799 
12800 		if ((size_t)length < path_len) {
12801 			memcpy((char *)path_words, buf, length);
12802 			memset((char *)path_words + length, 0, path_len - length);
12803 
12804 			path_len = length;
12805 		} else {
12806 			memcpy((char *)path_words, buf + (length - path_len), path_len);
12807 		}
12808 
12809 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
12810 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
12811 	}
12812 
12813 	*pathlen = length; /* may be superseded by error */
12814 
12815 out:
12816 	return error;
12817 }
12818 
12819 /*
12820  * Obtain the full pathname of a file system object by id.
12821  */
12822 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)12823 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
12824     uint32_t options, user_ssize_t *retval)
12825 {
12826 	vfs_context_t ctx = vfs_context_current();
12827 	fsid_t fsid;
12828 	char *realpath;
12829 	int length;
12830 	int error;
12831 
12832 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
12833 		return EINVAL;
12834 	}
12835 
12836 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
12837 		return error;
12838 	}
12839 	AUDIT_ARG(value32, fsid.val[0]);
12840 	AUDIT_ARG(value64, objid);
12841 	/* Restrict output buffer size for now. */
12842 
12843 	if (bufsize > PAGE_SIZE || bufsize <= 0) {
12844 		return EINVAL;
12845 	}
12846 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
12847 	if (realpath == NULL) {
12848 		return ENOMEM;
12849 	}
12850 
12851 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
12852 	    options, &length);
12853 
12854 	if (error) {
12855 		goto out;
12856 	}
12857 
12858 	error = copyout((caddr_t)realpath, buf, length);
12859 
12860 	*retval = (user_ssize_t)length; /* may be superseded by error */
12861 out:
12862 	kfree_data(realpath, bufsize);
12863 	return error;
12864 }
12865 
12866 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)12867 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
12868 {
12869 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
12870 	           0, retval);
12871 }
12872 
12873 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)12874 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
12875 {
12876 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
12877 	           uap->options, retval);
12878 }
12879 
12880 /*
12881  * Common routine to handle various flavors of statfs data heading out
12882  *	to user space.
12883  *
12884  * Returns:	0			Success
12885  *		EFAULT
12886  */
12887 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)12888 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
12889     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
12890     boolean_t partial_copy)
12891 {
12892 	int             error;
12893 	int             my_size, copy_size;
12894 
12895 	if (is_64_bit) {
12896 		struct user64_statfs sfs;
12897 		my_size = copy_size = sizeof(sfs);
12898 		bzero(&sfs, my_size);
12899 		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12900 		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
12901 		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12902 		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
12903 		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
12904 		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
12905 		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
12906 		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
12907 		sfs.f_files = (user64_long_t)sfsp->f_files;
12908 		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
12909 		sfs.f_fsid = sfsp->f_fsid;
12910 		sfs.f_owner = sfsp->f_owner;
12911 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12912 			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12913 		} else {
12914 			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12915 		}
12916 		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12917 		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12918 
12919 		if (partial_copy) {
12920 			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12921 		}
12922 		error = copyout((caddr_t)&sfs, bufp, copy_size);
12923 	} else {
12924 		struct user32_statfs sfs;
12925 
12926 		my_size = copy_size = sizeof(sfs);
12927 		bzero(&sfs, my_size);
12928 
12929 		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12930 		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
12931 		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12932 
12933 		/*
12934 		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
12935 		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
12936 		 * to reflect the filesystem size as best we can.
12937 		 */
12938 		if ((sfsp->f_blocks > INT_MAX)
12939 		    /* Hack for 4061702 . I think the real fix is for Carbon to
12940 		     * look for some volume capability and not depend on hidden
12941 		     * semantics agreed between a FS and carbon.
12942 		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
12943 		     * for Carbon to set bNoVolumeSizes volume attribute.
12944 		     * Without this the webdavfs files cannot be copied onto
12945 		     * disk as they look huge. This change should not affect
12946 		     * XSAN as they should not setting these to -1..
12947 		     */
12948 		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
12949 		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
12950 		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
12951 			int             shift;
12952 
12953 			/*
12954 			 * Work out how far we have to shift the block count down to make it fit.
12955 			 * Note that it's possible to have to shift so far that the resulting
12956 			 * blocksize would be unreportably large.  At that point, we will clip
12957 			 * any values that don't fit.
12958 			 *
12959 			 * For safety's sake, we also ensure that f_iosize is never reported as
12960 			 * being smaller than f_bsize.
12961 			 */
12962 			for (shift = 0; shift < 32; shift++) {
12963 				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
12964 					break;
12965 				}
12966 				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
12967 					break;
12968 				}
12969 			}
12970 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
12971 			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
12972 			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
12973 			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
12974 #undef __SHIFT_OR_CLIP
12975 			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
12976 			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
12977 		} else {
12978 			/* filesystem is small enough to be reported honestly */
12979 			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
12980 			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
12981 			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
12982 			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
12983 			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
12984 		}
12985 		sfs.f_files = (user32_long_t)sfsp->f_files;
12986 		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
12987 		sfs.f_fsid = sfsp->f_fsid;
12988 		sfs.f_owner = sfsp->f_owner;
12989 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12990 			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12991 		} else {
12992 			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12993 		}
12994 		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12995 		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12996 
12997 		if (partial_copy) {
12998 			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12999 		}
13000 		error = copyout((caddr_t)&sfs, bufp, copy_size);
13001 	}
13002 
13003 	if (sizep != NULL) {
13004 		*sizep = my_size;
13005 	}
13006 	return error;
13007 }
13008 
13009 /*
13010  * copy stat structure into user_stat structure.
13011  */
13012 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13013 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13014 {
13015 	bzero(usbp, sizeof(*usbp));
13016 
13017 	usbp->st_dev = sbp->st_dev;
13018 	usbp->st_ino = sbp->st_ino;
13019 	usbp->st_mode = sbp->st_mode;
13020 	usbp->st_nlink = sbp->st_nlink;
13021 	usbp->st_uid = sbp->st_uid;
13022 	usbp->st_gid = sbp->st_gid;
13023 	usbp->st_rdev = sbp->st_rdev;
13024 #ifndef _POSIX_C_SOURCE
13025 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13026 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13027 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13028 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13029 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13030 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13031 #else
13032 	usbp->st_atime = sbp->st_atime;
13033 	usbp->st_atimensec = sbp->st_atimensec;
13034 	usbp->st_mtime = sbp->st_mtime;
13035 	usbp->st_mtimensec = sbp->st_mtimensec;
13036 	usbp->st_ctime = sbp->st_ctime;
13037 	usbp->st_ctimensec = sbp->st_ctimensec;
13038 #endif
13039 	usbp->st_size = sbp->st_size;
13040 	usbp->st_blocks = sbp->st_blocks;
13041 	usbp->st_blksize = sbp->st_blksize;
13042 	usbp->st_flags = sbp->st_flags;
13043 	usbp->st_gen = sbp->st_gen;
13044 	usbp->st_lspare = sbp->st_lspare;
13045 	usbp->st_qspare[0] = sbp->st_qspare[0];
13046 	usbp->st_qspare[1] = sbp->st_qspare[1];
13047 }
13048 
/*
 * Copy a kernel struct stat into the 32-bit user ABI layout; timestamps
 * are narrowed with explicit casts.  The destination is zeroed first so
 * padding never leaks kernel memory to user space.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style timestamp fields, truncated to 32-bit user types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* split sec/nsec timestamp fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13085 
13086 /*
13087  * copy stat64 structure into user_stat64 structure.
13088  */
13089 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)13090 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
13091 {
13092 	bzero(usbp, sizeof(*usbp));
13093 
13094 	usbp->st_dev = sbp->st_dev;
13095 	usbp->st_ino = sbp->st_ino;
13096 	usbp->st_mode = sbp->st_mode;
13097 	usbp->st_nlink = sbp->st_nlink;
13098 	usbp->st_uid = sbp->st_uid;
13099 	usbp->st_gid = sbp->st_gid;
13100 	usbp->st_rdev = sbp->st_rdev;
13101 #ifndef _POSIX_C_SOURCE
13102 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13103 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13104 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13105 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13106 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13107 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13108 	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
13109 	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
13110 #else
13111 	usbp->st_atime = sbp->st_atime;
13112 	usbp->st_atimensec = sbp->st_atimensec;
13113 	usbp->st_mtime = sbp->st_mtime;
13114 	usbp->st_mtimensec = sbp->st_mtimensec;
13115 	usbp->st_ctime = sbp->st_ctime;
13116 	usbp->st_ctimensec = sbp->st_ctimensec;
13117 	usbp->st_birthtime = sbp->st_birthtime;
13118 	usbp->st_birthtimensec = sbp->st_birthtimensec;
13119 #endif
13120 	usbp->st_size = sbp->st_size;
13121 	usbp->st_blocks = sbp->st_blocks;
13122 	usbp->st_blksize = sbp->st_blksize;
13123 	usbp->st_flags = sbp->st_flags;
13124 	usbp->st_gen = sbp->st_gen;
13125 	usbp->st_lspare = sbp->st_lspare;
13126 	usbp->st_qspare[0] = sbp->st_qspare[0];
13127 	usbp->st_qspare[1] = sbp->st_qspare[1];
13128 }
13129 
/*
 * Copy a kernel struct stat64 into the 32-bit user ABI layout; timestamps
 * (including birthtime) are narrowed with explicit casts.  The destination
 * is zeroed first so padding never leaks kernel memory to user space.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec-style timestamp fields, truncated to 32-bit user types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* split sec/nsec timestamp fields, including birthtime */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13170 
13171 /*
13172  * Purge buffer cache for simulating cold starts
13173  */
13174 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13175 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13176 {
13177 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13178 
13179 	return VNODE_RETURNED;
13180 }
13181 
13182 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13183 vfs_purge_callback(mount_t mp, __unused void * arg)
13184 {
13185 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13186 
13187 	return VFS_RETURNED;
13188 }
13189 
13190 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13191 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13192 {
13193 	if (!kauth_cred_issuser(kauth_cred_get())) {
13194 		return EPERM;
13195 	}
13196 
13197 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13198 
13199 	return 0;
13200 }
13201 
13202 /*
13203  * gets the vnode associated with the (unnamed) snapshot directory
13204  * for a Filesystem. The snapshot directory vnode is returned with
13205  * an iocount on it.
13206  */
13207 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13208 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13209 {
13210 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13211 }
13212 
13213 /*
13214  * Get the snapshot vnode.
13215  *
13216  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13217  * needs nameidone() on ndp.
13218  *
13219  * If the snapshot vnode exists it is returned in ndp->ni_vp.
13220  *
13221  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13222  * not needed.
13223  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Start with clean out-parameters so failure paths are unambiguous. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Resolve dirfd to a vnode; returns it with an iocount held. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations must be addressed to the root of a volume. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	/*
	 * VOL_CAP_INT_SNAPSHOT must be both valid (the FS reports on it) and
	 * set (the FS actually supports it); otherwise ENOTSUP.
	 */
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': loop stops early only if one is found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC hooks exist only for CREATE and DELETE; other ops are unchecked here. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	/* On success the caller owns the iocounts and must call nameidone(). */
	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/*
	 * On any error, drop the iocounts we acquired and NULL the
	 * out-parameters so the caller sees a clean failure.
	 */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
13326 
13327 /*
13328  * create a filesystem snapshot (for supporting filesystems)
13329  *
13330  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13331  * We get to the (unnamed) snapshot directory vnode and create the vnode
13332  * for the snapshot in it.
13333  *
13334  * Restrictions:
13335  *
13336  *    a) Passed in name for snapshot cannot have slashes.
13337  *    b) name can't be "." or ".."
13338  *
13339  * Since this requires superuser privileges, vnode_authorize calls are not
13340  * made.
13341  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large to live on the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * On success we hold iocounts on the volume root (rvp) and the
	 * snapshot directory (snapdvp), plus an active namei state in ndp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* A snapshot with this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/*
		 * VN_CREATE_NOAUTH: authorization was already enforced by the
		 * caller (see function header comment above).
		 */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			/* We don't need the new snapshot vnode; drop its iocount. */
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	/* Release namei state and the iocounts from vnode_get_snapshot(). */
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13388 
13389 /*
13390  * Delete a Filesystem snapshot
13391  *
13392  * get the vnode for the unnamed snapshot directory and the snapshot and
13393  * delete the snapshot.
13394  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large to live on the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Look up the snapshot by name; on success we hold iocounts on the
	 * volume root (rvp), the snapshot directory (snapdvp) and the
	 * snapshot itself (ndp->ni_vp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot, suppressing the namespace event. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	/* Release all iocounts and the namei state, innermost first. */
	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13423 
13424 /*
13425  * Revert a filesystem to a snapshot
13426  *
13427  * Marks the filesystem to revert to the given snapshot on next mount.
13428  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Resolve dirfd to a vnode (with an iocount) and find its mount. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/*
	 * Hand the snapshot name to the filesystem as a synthetic
	 * componentname (LOOKUP, last component, caller-owned buffer).
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Look the snapshot up and revert via the vnode-level ioctl. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		/* Drop the iocounts/namei state from vnode_get_snapshot(). */
		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
13512 
13513 /*
13514  * rename a Filesystem snapshot
13515  *
13516  * get the vnode for the unnamed snapshot directory and the snapshot and
13517  * rename the snapshot. This is a very specialised (and simple) case of
13518  * rename(2) (which has to deal with a lot more complications). It differs
13519  * slightly from rename(2) in that EEXIST is returned if the new name exists.
13520  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot (DELETE nameiop, as on rename(2)'s
	 * "from" side); on success we hold iocounts on rvp, snapdvp and
	 * the source snapshot (fvp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': loop stops early only if one is found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Creating the new name is policy-equivalent to snapshot creation. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name inside the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both source and destination live in the same snapshot directory. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
13615 
13616 /*
13617  * Mount a Filesystem snapshot
13618  *
13619  * get the vnode for the unnamed snapshot directory and the snapshot and
13620  * mount the snapshot.
13621  */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Find the snapshot; on success we hold iocounts on the volume root
	 * (rvp), the snapshot directory (snapdvp) and the snapshot itself.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* Bail out if rvp's mount has been torn down (dead mount). */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root directory of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Perform the mount; only MNT_DONTBROWSE is honored from the
	 * caller-supplied flags, and KERNEL_MOUNT_SNAPSHOT tells
	 * mount_common() to treat smnt_data as snapshot-mount arguments.
	 */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	/* Release covered-directory lookup state. */
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	/* Release snapshot lookup state from vnode_get_snapshot(). */
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
13697 
13698 /*
13699  * Root from a snapshot of the filesystem
13700  *
13701  * Marks the filesystem to root from the given snapshot on next boot.
13702  */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Resolve dirfd to a vnode (with an iocount) and find its mount. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/*
	 * Hand the snapshot name to the filesystem as a synthetic
	 * componentname (LOOKUP, last component, caller-owned buffer).
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
13758 
13759 /*
13760  * FS snapshot operations dispatcher
13761  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Every snapshot operation requires the PRIV_VFS_SNAPSHOT privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/*
			 * No cached device vnode: fall back to looking up the
			 * "mounted from" name.
			 */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed if any of the following holds: superuser context,
		 * write access to the backing device vnode, or the private
		 * snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOCurrentTaskHasEntitlement("com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		/* name1 is the existing snapshot name, name2 the new one. */
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name1 is the snapshot name, name2 the mount-point path. */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
13851