xref: /xnu-8019.80.24/bsd/vfs/vfs_syscalls.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111 
112 #include <vfs/vfs_disk_conditioner.h>
113 
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116 
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121 
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124 
125 #include <libkern/OSAtomic.h>
126 #include <os/atomic_private.h>
127 #include <pexpert/pexpert.h>
128 #include <IOKit/IOBSD.h>
129 
130 // deps for MIG call
131 #include <kern/host.h>
132 #include <kern/ipc_misc.h>
133 #include <mach/host_priv.h>
134 #include <mach/vfs_nspace.h>
135 #include <os/log.h>
136 
137 #include <nfs/nfs_conf.h>
138 
139 #if ROUTEFS
140 #include <miscfs/routefs/routefs.h>
141 #endif /* ROUTEFS */
142 
143 #if CONFIG_MACF
144 #include <security/mac.h>
145 #include <security/mac_framework.h>
146 #endif
147 
148 #if CONFIG_FSE
149 #define GET_PATH(x) \
150 	((x) = get_pathbuff())
151 #define RELEASE_PATH(x) \
152 	release_pathbuff(x)
153 #else
154 #define GET_PATH(x)     \
155 	((x) = zalloc(ZV_NAMEI))
156 #define RELEASE_PATH(x) \
157 	zfree(ZV_NAMEI, x)
158 #endif /* CONFIG_FSE */
159 
160 #ifndef HFS_GET_BOOT_INFO
161 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
162 #endif
163 
164 #ifndef HFS_SET_BOOT_INFO
165 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
166 #endif
167 
168 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
169 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
170 #endif
171 
172 extern void disk_conditioner_unmount(mount_t mp);
173 
174 /* struct for checkdirs iteration */
175 struct cdirargs {
176 	vnode_t olddp;
177 	vnode_t newdp;
178 };
179 /* callback  for checkdirs iteration */
180 static int checkdirs_callback(proc_t p, void * arg);
181 
182 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
183 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
184 void enablequotas(struct mount *mp, vfs_context_t ctx);
185 static int getfsstat_callback(mount_t mp, void * arg);
186 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
187 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
188 static int sync_callback(mount_t, void *);
189 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
190     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
191     boolean_t partial_copy);
192 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
193 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
194     struct componentname *cnp, user_addr_t fsmountargs,
195     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
196 void vfs_notify_mount(vnode_t pdvp);
197 
198 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
199 
200 struct fd_vn_data * fg_vn_data_alloc(void);
201 
202 /*
203  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
204  * Concurrent lookups (or lookups by ids) on hard links can cause the
205  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
206  * does) to return ENOENT as the path cannot be returned from the name cache
207  * alone. We have no option but to retry and hope to get one namei->reverse path
208  * generation done without an intervening lookup, lookup by id on the hard link
209  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
210  * which currently are the MAC hooks for rename, unlink and rmdir.
211  */
212 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
213 
214 /* Max retry limit for rename due to vnode recycling. */
215 #define MAX_RENAME_ERECYCLE_RETRIES 1024
216 
217 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
218     int unlink_flags);
219 
220 #ifdef CONFIG_IMGSRC_ACCESS
221 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
222 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
223 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
224 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
225 static void mount_end_update(mount_t mp);
226 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
227 #endif /* CONFIG_IMGSRC_ACCESS */
228 
229 //snapshot functions
230 #if CONFIG_MNT_ROOTSNAP
231 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
232 #else
233 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
234 #endif
235 
236 __private_extern__
237 int sync_internal(void);
238 
239 __private_extern__
240 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
241 
242 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
243 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
244 
245 /* vars for sync mutex */
246 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
247 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
248 
249 extern lck_rw_t rootvnode_rw_lock;
250 
251 /*
252  * incremented each time a mount or unmount operation occurs
253  * used to invalidate the cached value of the rootvp in the
254  * mount structure utilized by cache_lookup_path
255  */
256 uint32_t mount_generation = 0;
257 
258 /* counts number of mount and unmount operations */
259 unsigned int vfs_nummntops = 0;
260 
261 /* system-wide, per-boot unique mount ID */
262 static _Atomic uint64_t mount_unique_id = 1;
263 
264 extern const struct fileops vnops;
265 #if CONFIG_APPLEDOUBLE
266 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
267 #endif /* CONFIG_APPLEDOUBLE */
268 
269 /*
270  * Virtual File System System Calls
271  */
272 
273 /*
274  * Private in-kernel mounting spi (specific use-cases only)
275  */
276 boolean_t
vfs_iskernelmount(mount_t mp)277 vfs_iskernelmount(mount_t mp)
278 {
279 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
280 }
281 
/*
 * kernel_mount:
 *	Perform a mount on behalf of in-kernel callers.
 *
 * Parameters:	fstype		filesystem type name
 *		pvp		parent of the covered vnode (may be NULLVP;
 *				resolved via namei when vp is NULLVP)
 *		vp		vnode to be covered (NULLVP => look up 'path')
 *		path		kernel-space path to the mount-on point
 *		data		filesystem-specific mount arguments (opaque,
 *				passed through to mount_common())
 *		datalen		unused
 *		syscall_flags	generic MNT_* mount flags
 *		kern_flags	KERNEL_MOUNT_* internal flags (sanitized here)
 *		ctx		caller's vfs context
 *
 * Returns:	0		Success
 *		!0		errno from namei() or mount_common()
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Restrict callers to the internal flag bits we know how to handle. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		/* namei() gave us iocounts on both vp and its parent. */
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode: fake up just enough of
		 * the component name for mount_common() to use the path.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	if (did_namei) {
		/* Drop the iocounts and namei state we acquired above. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
331 
332 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)333 vfs_mount_at_path(const char *fstype, const char *path,
334     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
335     int mnt_flags, int flags)
336 {
337 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
338 	int error, km_flags = 0;
339 
340 	/*
341 	 * This call is currently restricted to specific use cases.
342 	 */
343 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
344 		return ENOTSUP;
345 	}
346 
347 #if !defined(XNU_TARGET_OS_OSX)
348 	if (strcmp(fstype, "lifs") == 0) {
349 		syscall_flags |= MNT_NOEXEC;
350 	}
351 #endif
352 
353 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
354 		km_flags |= KERNEL_MOUNT_NOAUTH;
355 	}
356 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
357 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
358 	}
359 
360 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
361 	    syscall_flags, km_flags, vfs_context_kernel());
362 	if (error) {
363 		printf("%s: mount on %s failed, error %d\n", __func__, path,
364 		    error);
365 	}
366 
367 	return error;
368 }
369 
370 int
vfs_mount_override_type_name(mount_t mp,const char * name)371 vfs_mount_override_type_name(mount_t mp, const char *name)
372 {
373 	if (mp == NULL || name == NULL) {
374 		return EINVAL;
375 	}
376 
377 	/* Override the FS type name. */
378 	mount_lock_spin(mp);
379 	strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
380 	mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
381 	mount_unlock(mp);
382 
383 	return 0;
384 }
385 
386 /*
387  * Mount a file system.
388  */
389 /* ARGSUSED */
390 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)391 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
392 {
393 	struct __mac_mount_args muap;
394 
395 	muap.type = uap->type;
396 	muap.path = uap->path;
397 	muap.flags = uap->flags;
398 	muap.data = uap->data;
399 	muap.mac_p = USER_ADDR_NULL;
400 	return __mac_mount(p, &muap, retval);
401 }
402 
403 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)404 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
405 {
406 	struct componentname    cn;
407 	vfs_context_t           ctx = vfs_context_current();
408 	size_t                  dummy = 0;
409 	int                     error;
410 	int                     flags = uap->flags;
411 	char                    fstypename[MFSNAMELEN];
412 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
413 	vnode_t                 pvp;
414 	vnode_t                 vp;
415 
416 	AUDIT_ARG(fd, uap->fd);
417 	AUDIT_ARG(fflags, flags);
418 	/* fstypename will get audited by mount_common */
419 
420 	/* Sanity check the flags */
421 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
422 		return ENOTSUP;
423 	}
424 
425 	if (flags & MNT_UNION) {
426 		return EPERM;
427 	}
428 
429 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
430 	if (error) {
431 		return error;
432 	}
433 
434 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
435 		return error;
436 	}
437 
438 	if ((error = vnode_getwithref(vp)) != 0) {
439 		file_drop(uap->fd);
440 		return error;
441 	}
442 
443 	pvp = vnode_getparent(vp);
444 	if (pvp == NULL) {
445 		vnode_put(vp);
446 		file_drop(uap->fd);
447 		return EINVAL;
448 	}
449 
450 	memset(&cn, 0, sizeof(struct componentname));
451 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
452 	cn.cn_pnlen = MAXPATHLEN;
453 
454 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
455 		zfree(ZV_NAMEI, cn.cn_pnbuf);
456 		vnode_put(pvp);
457 		vnode_put(vp);
458 		file_drop(uap->fd);
459 		return error;
460 	}
461 
462 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
463 
464 	zfree(ZV_NAMEI, cn.cn_pnbuf);
465 	vnode_put(pvp);
466 	vnode_put(vp);
467 	file_drop(uap->fd);
468 
469 	return error;
470 }
471 
/*
 * Post mount notifications: a system-wide VQ_MOUNT vfs event, plus a
 * NOTE_WRITE knote on the parent directory of the new mount point.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
478 
479 /*
480  * __mac_mount:
481  *	Mount a file system taking into account MAC label behavior.
482  *	See mount(2) man page for more information
483  *
484  * Parameters:    p                        Process requesting the mount
485  *                uap                      User argument descriptor (see below)
486  *                retval                   (ignored)
487  *
488  * Indirect:      uap->type                Filesystem type
489  *                uap->path                Path to mount
490  *                uap->data                Mount arguments
491  *                uap->mac_p               MAC info
492  *                uap->flags               Mount flags
493  *
494  *
495  * Returns:        0                       Success
496  *                !0                       Not success
497  */
/*
 * Set to TRUE when something attempts to mount the root filesystem
 * read/write; used to disable the codesign validation bitmap
 * optimization (see CHECK_CS_VALIDATION_BITMAP below).
 */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;             /* parent of the covered vnode */
	vnode_t vp = NULL;              /* vnode to be covered by the mount */
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * NOTE: exact equality ('==') is deliberate here — imgsrc
		 * relocation is only taken when no other mount flags are set.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Copy in the mac struct at the caller's natural width. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Validate the label length before allocating for it. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are disallowed when not configured in. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling for mounts on top of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* Safe when labelstr is NULL and labelsz is 0. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
657 
658 /*
659  * common mount implementation (final stage of mounting)
660  *
661  * Arguments:
662  *  fstypename	file system type (ie it's vfs name)
663  *  pvp		parent of covered vnode
664  *  vp		covered vnode
665  *  cnp		component name (ie path) of covered vnode
666  *  flags	generic mount flags
667  *  fsmountargs	file system specific data
668  *  labelstr	optional MAC label
669  *  kernelmount	TRUE for mounts initiated from inside the kernel
670  *  ctx		caller's context
671  */
672 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)673 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
674     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
675     char *labelstr, vfs_context_t ctx)
676 {
677 #if !CONFIG_MACF
678 #pragma unused(labelstr)
679 #endif
680 	struct vnode *devvp = NULLVP;
681 	struct vnode *device_vnode = NULLVP;
682 #if CONFIG_MACF
683 	struct vnode *rvp;
684 #endif
685 	struct mount *mp;
686 	struct vfstable *vfsp = (struct vfstable *)0;
687 	struct proc *p = vfs_context_proc(ctx);
688 	int error, flag = 0;
689 	bool flag_set = false;
690 	user_addr_t devpath = USER_ADDR_NULL;
691 	int ronly = 0;
692 	int mntalloc = 0;
693 	boolean_t vfsp_ref = FALSE;
694 	boolean_t is_rwlock_locked = FALSE;
695 	boolean_t did_rele = FALSE;
696 	boolean_t have_usecount = FALSE;
697 	boolean_t did_set_lmount = FALSE;
698 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
699 
700 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
701 	/* Check for mutually-exclusive flag bits */
702 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
703 	int bitcount = 0;
704 	while (checkflags != 0) {
705 		checkflags &= (checkflags - 1);
706 		bitcount++;
707 	}
708 
709 	if (bitcount > 1) {
710 		//not allowed to request multiple mount-by-role flags
711 		error = EINVAL;
712 		goto out1;
713 	}
714 #endif
715 
716 	/*
717 	 * Process an update for an existing mount
718 	 */
719 	if (flags & MNT_UPDATE) {
720 		if ((vp->v_flag & VROOT) == 0) {
721 			error = EINVAL;
722 			goto out1;
723 		}
724 		mp = vp->v_mount;
725 
726 		/* if unmount or mount in progress, return error */
727 		mount_lock_spin(mp);
728 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
729 			mount_unlock(mp);
730 			error = EBUSY;
731 			goto out1;
732 		}
733 		mp->mnt_lflag |= MNT_LMOUNT;
734 		did_set_lmount = TRUE;
735 		mount_unlock(mp);
736 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
737 		is_rwlock_locked = TRUE;
738 		/*
739 		 * We only allow the filesystem to be reloaded if it
740 		 * is currently mounted read-only.
741 		 */
742 		if ((flags & MNT_RELOAD) &&
743 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
744 			error = ENOTSUP;
745 			goto out1;
746 		}
747 
748 		/*
749 		 * If content protection is enabled, update mounts are not
750 		 * allowed to turn it off.
751 		 */
752 		if ((mp->mnt_flag & MNT_CPROTECT) &&
753 		    ((flags & MNT_CPROTECT) == 0)) {
754 			error = EINVAL;
755 			goto out1;
756 		}
757 
758 		/*
759 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
760 		 * failure to return an error for this so we'll just silently
761 		 * add it if it is not passed in.
762 		 */
763 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
764 		    ((flags & MNT_REMOVABLE) == 0)) {
765 			flags |= MNT_REMOVABLE;
766 		}
767 
768 		/* Can't downgrade the backer of the root FS */
769 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
770 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
771 			error = ENOTSUP;
772 			goto out1;
773 		}
774 
775 		/*
776 		 * Only root, or the user that did the original mount is
777 		 * permitted to update it.
778 		 */
779 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
780 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
781 			goto out1;
782 		}
783 #if CONFIG_MACF
784 		error = mac_mount_check_remount(ctx, mp);
785 		if (error != 0) {
786 			goto out1;
787 		}
788 #endif
789 		/*
790 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
791 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
792 		 */
793 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
794 			flags |= MNT_NOSUID | MNT_NODEV;
795 			if (mp->mnt_flag & MNT_NOEXEC) {
796 				flags |= MNT_NOEXEC;
797 			}
798 		}
799 		flag = mp->mnt_flag;
800 		flag_set = true;
801 
802 
803 
804 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
805 
806 		vfsp = mp->mnt_vtable;
807 		goto update;
808 	} // MNT_UPDATE
809 
810 	/*
811 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
812 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
813 	 */
814 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
815 		flags |= MNT_NOSUID | MNT_NODEV;
816 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
817 			flags |= MNT_NOEXEC;
818 		}
819 	}
820 
821 	/* XXXAUDIT: Should we capture the type on the error path as well? */
822 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
823 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
824 	mount_list_lock();
825 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
826 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
827 			vfsp->vfc_refcount++;
828 			vfsp_ref = TRUE;
829 			break;
830 		}
831 	}
832 	mount_list_unlock();
833 	if (vfsp == NULL) {
834 		error = ENODEV;
835 		goto out1;
836 	}
837 
838 	/*
839 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
840 	 * except in ROSV configs and for the initial BaseSystem root.
841 	 */
842 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
843 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
844 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
845 		error = EINVAL;  /* unsupported request */
846 		goto out1;
847 	}
848 
849 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
850 	if (error != 0) {
851 		goto out1;
852 	}
853 
854 	/*
855 	 * Allocate and initialize the filesystem (mount_t)
856 	 */
857 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
858 	mntalloc = 1;
859 
860 	/* Initialize the default IO constraints */
861 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
862 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
863 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
864 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
865 	mp->mnt_devblocksize = DEV_BSIZE;
866 	mp->mnt_alignmentmask = PAGE_MASK;
867 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
868 	mp->mnt_ioscale = 1;
869 	mp->mnt_ioflags = 0;
870 	mp->mnt_realrootvp = NULLVP;
871 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
872 
873 	mp->mnt_lflag |= MNT_LMOUNT;
874 	did_set_lmount = TRUE;
875 
876 	TAILQ_INIT(&mp->mnt_vnodelist);
877 	TAILQ_INIT(&mp->mnt_workerqueue);
878 	TAILQ_INIT(&mp->mnt_newvnodes);
879 	mount_lock_init(mp);
880 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
881 	is_rwlock_locked = TRUE;
882 	mp->mnt_op = vfsp->vfc_vfsops;
883 	mp->mnt_vtable = vfsp;
884 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
885 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
886 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
887 	do {
888 		int pathlen = MAXPATHLEN;
889 
890 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
891 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
892 		}
893 	} while (0);
894 	mp->mnt_vnodecovered = vp;
895 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
896 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
897 	mp->mnt_devbsdunit = 0;
898 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
899 
900 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
901 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
902 
903 	if (kernelmount) {
904 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
905 	}
906 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
907 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
908 	}
909 
910 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
911 		// kernel mounted devfs
912 		mp->mnt_kern_flag |= MNTK_SYSTEM;
913 	}
914 
915 update:
916 
917 	/*
918 	 * Set the mount level flags.
919 	 */
920 	if (flags & MNT_RDONLY) {
921 		mp->mnt_flag |= MNT_RDONLY;
922 	} else if (mp->mnt_flag & MNT_RDONLY) {
923 		// disallow read/write upgrades of file systems that
924 		// had the TYPENAME_OVERRIDE feature set.
925 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
926 			error = EPERM;
927 			goto out1;
928 		}
929 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
930 	}
931 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
932 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
933 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
934 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
935 	    MNT_QUARANTINE | MNT_CPROTECT);
936 
937 #if SECURE_KERNEL
938 #if !CONFIG_MNT_SUID
939 	/*
940 	 * On release builds of iOS based platforms, always enforce NOSUID on
941 	 * all mounts. We do this here because we can catch update mounts as well as
942 	 * non-update mounts in this case.
943 	 */
944 	mp->mnt_flag |= (MNT_NOSUID);
945 #endif
946 #endif
947 
948 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
949 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
950 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
951 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
952 	    MNT_QUARANTINE | MNT_CPROTECT);
953 
954 #if CONFIG_MACF
955 	if (flags & MNT_MULTILABEL) {
956 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
957 			error = EINVAL;
958 			goto out1;
959 		}
960 		mp->mnt_flag |= MNT_MULTILABEL;
961 	}
962 #endif
963 	/*
964 	 * Process device path for local file systems if requested.
965 	 *
966 	 * Snapshot and mount-by-role mounts do not use this path; they are
967 	 * passing other opaque data in the device path field.
968 	 *
969 	 * Basesystemroot mounts pass a device path to be resolved here,
970 	 * but it's just a char * already inside the kernel, which
971 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
972 	 * mounts we must skip copyin (both of the address and of the string
973 	 * (in NDINIT).
974 	 */
975 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
976 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
977 		boolean_t do_copyin_devpath = true;
978 #if CONFIG_BASESYSTEMROOT
979 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
981 			// We have been passed fsmountargs, which is typed as a user_addr_t,
982 			// but is actually a char ** pointing to a (kernelspace) string.
983 			// We manually unpack it with a series of casts and dereferences
984 			// that reverses what was done just above us on the stack in
985 			// imageboot_pivot_image().
986 			// After retrieving the path to the dev node (which we will NDINIT
987 			// in a moment), we pass NULL fsmountargs on to the filesystem.
988 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
989 			char **devnamepp = (char **)fsmountargs;
990 			char *devnamep = *devnamepp;
991 			devpath = CAST_USER_ADDR_T(devnamep);
992 			do_copyin_devpath = false;
993 			fsmountargs = USER_ADDR_NULL;
994 
995 			//Now that we have a mp, denote that this mount is for the basesystem.
996 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
997 		}
998 #endif // CONFIG_BASESYSTEMROOT
999 
1000 		if (do_copyin_devpath) {
1001 			if (vfs_context_is64bit(ctx)) {
1002 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1003 					goto out1;
1004 				}
1005 				fsmountargs += sizeof(devpath);
1006 			} else {
1007 				user32_addr_t tmp;
1008 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1009 					goto out1;
1010 				}
1011 				/* munge into LP64 addr */
1012 				devpath = CAST_USER_ADDR_T(tmp);
1013 				fsmountargs += sizeof(tmp);
1014 			}
1015 		}
1016 
1017 		/* Lookup device and authorize access to it */
1018 		if ((devpath)) {
1019 			struct nameidata nd;
1020 
1021 			enum uio_seg seg = UIO_USERSPACE;
1022 #if CONFIG_BASESYSTEMROOT
1023 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1024 				seg = UIO_SYSSPACE;
1025 			}
1026 #endif // CONFIG_BASESYSTEMROOT
1027 
1028 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1029 			if ((error = namei(&nd))) {
1030 				goto out1;
1031 			}
1032 
1033 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1034 			devvp = nd.ni_vp;
1035 
1036 			nameidone(&nd);
1037 
1038 			if (devvp->v_type != VBLK) {
1039 				error = ENOTBLK;
1040 				goto out2;
1041 			}
1042 			if (major(devvp->v_rdev) >= nblkdev) {
1043 				error = ENXIO;
1044 				goto out2;
1045 			}
1046 			/*
1047 			 * If mount by non-root, then verify that user has necessary
1048 			 * permissions on the device.
1049 			 */
1050 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1051 				mode_t accessmode = KAUTH_VNODE_READ_DATA;
1052 
1053 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1054 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1055 				}
1056 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1057 					goto out2;
1058 				}
1059 			}
1060 		}
1061 		/* On first mount, preflight and open device */
1062 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1063 			if ((error = vnode_ref(devvp))) {
1064 				goto out2;
1065 			}
1066 			/*
1067 			 * Disallow multiple mounts of the same device.
1068 			 * Disallow mounting of a device that is currently in use
1069 			 * (except for root, which might share swap device for miniroot).
1070 			 * Flush out any old buffers remaining from a previous use.
1071 			 */
1072 			if ((error = vfs_mountedon(devvp))) {
1073 				goto out3;
1074 			}
1075 
1076 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1077 				error = EBUSY;
1078 				goto out3;
1079 			}
1080 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1081 				error = ENOTBLK;
1082 				goto out3;
1083 			}
1084 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1085 				goto out3;
1086 			}
1087 
1088 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1089 #if CONFIG_MACF
1090 			error = mac_vnode_check_open(ctx,
1091 			    devvp,
1092 			    ronly ? FREAD : FREAD | FWRITE);
1093 			if (error) {
1094 				goto out3;
1095 			}
1096 #endif /* MAC */
1097 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1098 				goto out3;
1099 			}
1100 
1101 			mp->mnt_devvp = devvp;
1102 			device_vnode = devvp;
1103 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1104 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1105 		    (device_vnode = mp->mnt_devvp)) {
1106 			dev_t dev;
1107 			int maj;
1108 			/*
1109 			 * If upgrade to read-write by non-root, then verify
1110 			 * that user has necessary permissions on the device.
1111 			 */
1112 			vnode_getalways(device_vnode);
1113 
1114 			if (suser(vfs_context_ucred(ctx), NULL) &&
1115 			    (error = vnode_authorize(device_vnode, NULL,
1116 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1117 			    ctx)) != 0) {
1118 				vnode_put(device_vnode);
1119 				goto out2;
1120 			}
1121 
1122 			/* Tell the device that we're upgrading */
1123 			dev = (dev_t)device_vnode->v_rdev;
1124 			maj = major(dev);
1125 
1126 			if ((u_int)maj >= (u_int)nblkdev) {
1127 				panic("Volume mounted on a device with invalid major number.");
1128 			}
1129 
1130 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1131 			vnode_put(device_vnode);
1132 			device_vnode = NULLVP;
1133 			if (error != 0) {
1134 				goto out2;
1135 			}
1136 		}
1137 	} // localargs && !(snapshot | data | vm)
1138 
1139 #if CONFIG_MACF
1140 	if ((flags & MNT_UPDATE) == 0) {
1141 		mac_mount_label_init(mp);
1142 		mac_mount_label_associate(ctx, mp);
1143 	}
1144 	if (labelstr) {
1145 		if ((flags & MNT_UPDATE) != 0) {
1146 			error = mac_mount_check_label_update(ctx, mp);
1147 			if (error != 0) {
1148 				goto out3;
1149 			}
1150 		}
1151 	}
1152 #endif
1153 	/*
1154 	 * Mount the filesystem.  We already asserted that internal_flags
1155 	 * cannot have more than one mount-by-role bit set.
1156 	 */
1157 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1158 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1159 		    (caddr_t)fsmountargs, 0, ctx);
1160 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1161 #if CONFIG_ROSV_STARTUP
1162 		struct mount *origin_mp = (struct mount*)fsmountargs;
1163 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1164 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1165 		if (error) {
1166 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1167 		} else {
1168 			/* Mark volume associated with system volume */
1169 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1170 
1171 			/* Attempt to acquire the mnt_devvp and set it up */
1172 			struct vnode *mp_devvp = NULL;
1173 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1174 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1175 				    0, &mp_devvp, vfs_context_kernel());
1176 				if (!lerr) {
1177 					mp->mnt_devvp = mp_devvp;
1178 					//vnode_lookup took an iocount, need to drop it.
1179 					vnode_put(mp_devvp);
1180 					// now set `device_vnode` to the devvp that was acquired.
1181 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1182 					// note that though the iocount above was dropped, the mount acquires
1183 					// an implicit reference against the device.
1184 					device_vnode = mp_devvp;
1185 				}
1186 			}
1187 		}
1188 #else
1189 		error = EINVAL;
1190 #endif
1191 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1192 #if CONFIG_MOUNT_VM
1193 		struct mount *origin_mp = (struct mount*)fsmountargs;
1194 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1195 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1196 		if (error) {
1197 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1198 		} else {
1199 			/* Mark volume associated with system volume and a swap mount */
1200 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1201 			/* Attempt to acquire the mnt_devvp and set it up */
1202 			struct vnode *mp_devvp = NULL;
1203 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1204 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1205 				    0, &mp_devvp, vfs_context_kernel());
1206 				if (!lerr) {
1207 					mp->mnt_devvp = mp_devvp;
1208 					//vnode_lookup took an iocount, need to drop it.
1209 					vnode_put(mp_devvp);
1210 
1211 					// now set `device_vnode` to the devvp that was acquired.
1212 					// note that though the iocount above was dropped, the mount acquires
1213 					// an implicit reference against the device.
1214 					device_vnode = mp_devvp;
1215 				}
1216 			}
1217 		}
1218 #else
1219 		error = EINVAL;
1220 #endif
1221 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1222 #if CONFIG_MOUNT_PREBOOTRECOVERY
1223 		struct mount *origin_mp = (struct mount*)fsmountargs;
1224 		uint32_t mount_role = 0;
1225 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1226 			mount_role = VFS_PREBOOT_ROLE;
1227 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1228 			mount_role = VFS_RECOVERY_ROLE;
1229 		}
1230 
1231 		if (mount_role != 0) {
1232 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1233 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1234 			if (error) {
1235 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1236 			} else {
1237 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1238 				/* Mark volume associated with system volume */
1239 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1240 				/* Attempt to acquire the mnt_devvp and set it up */
1241 				struct vnode *mp_devvp = NULL;
1242 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1243 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1244 					    0, &mp_devvp, vfs_context_kernel());
1245 					if (!lerr) {
1246 						mp->mnt_devvp = mp_devvp;
1247 						//vnode_lookup took an iocount, need to drop it.
1248 						vnode_put(mp_devvp);
1249 
1250 						// now set `device_vnode` to the devvp that was acquired.
1251 						// note that though the iocount above was dropped, the mount acquires
1252 						// an implicit reference against the device.
1253 						device_vnode = mp_devvp;
1254 					}
1255 				}
1256 			}
1257 		} else {
1258 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1259 			error = EINVAL;
1260 		}
1261 #else
1262 		error = EINVAL;
1263 #endif
1264 	} else {
1265 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1266 	}
1267 
1268 	if (flags & MNT_UPDATE) {
1269 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1270 			mp->mnt_flag &= ~MNT_RDONLY;
1271 		}
1272 		mp->mnt_flag &= ~
1273 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1274 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1275 		if (error) {
1276 			mp->mnt_flag = flag;  /* restore flag value */
1277 		}
1278 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1279 		lck_rw_done(&mp->mnt_rwlock);
1280 		is_rwlock_locked = FALSE;
1281 		if (!error) {
1282 			enablequotas(mp, ctx);
1283 		}
1284 		goto exit;
1285 	}
1286 
1287 	/*
1288 	 * Put the new filesystem on the mount list after root.
1289 	 */
1290 	if (error == 0) {
1291 		struct vfs_attr vfsattr;
1292 #if CONFIG_MACF
1293 		error = mac_mount_check_mount_late(ctx, mp);
1294 		if (error != 0) {
1295 			goto out4;
1296 		}
1297 
1298 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1299 			error = VFS_ROOT(mp, &rvp, ctx);
1300 			if (error) {
1301 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1302 				goto out4;
1303 			}
1304 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1305 			/*
1306 			 * drop reference provided by VFS_ROOT
1307 			 */
1308 			vnode_put(rvp);
1309 
1310 			if (error) {
1311 				goto out4;
1312 			}
1313 		}
1314 #endif  /* MAC */
1315 
1316 		vnode_lock_spin(vp);
1317 		CLR(vp->v_flag, VMOUNT);
1318 		vp->v_mountedhere = mp;
1319 		vnode_unlock(vp);
1320 
1321 		/*
1322 		 * taking the name_cache_lock exclusively will
1323 		 * insure that everyone is out of the fast path who
1324 		 * might be trying to use a now stale copy of
1325 		 * vp->v_mountedhere->mnt_realrootvp
1326 		 * bumping mount_generation causes the cached values
1327 		 * to be invalidated
1328 		 */
1329 		name_cache_lock();
1330 		mount_generation++;
1331 		name_cache_unlock();
1332 
1333 		error = vnode_ref(vp);
1334 		if (error != 0) {
1335 			goto out4;
1336 		}
1337 
1338 		have_usecount = TRUE;
1339 
1340 		error = checkdirs(vp, ctx);
1341 		if (error != 0) {
1342 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1343 			goto out4;
1344 		}
1345 		/*
1346 		 * there is no cleanup code here so I have made it void
1347 		 * we need to revisit this
1348 		 */
1349 		(void)VFS_START(mp, 0, ctx);
1350 
1351 		if (mount_list_add(mp) != 0) {
1352 			/*
1353 			 * The system is shutting down trying to umount
1354 			 * everything, so fail with a plausible errno.
1355 			 */
1356 			error = EBUSY;
1357 			goto out4;
1358 		}
1359 		lck_rw_done(&mp->mnt_rwlock);
1360 		is_rwlock_locked = FALSE;
1361 
1362 		/* Check if this mounted file system supports EAs or named streams. */
1363 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1364 		VFSATTR_INIT(&vfsattr);
1365 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1366 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1367 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1368 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1369 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1370 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1371 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1372 			}
1373 #if NAMEDSTREAMS
1374 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1375 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1376 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1377 			}
1378 #endif
1379 			/* Check if this file system supports path from id lookups. */
1380 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1381 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1382 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1383 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1384 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1385 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1386 			}
1387 
1388 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1389 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1390 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1391 			}
1392 		}
1393 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1394 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1395 		}
1396 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1397 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1398 		}
1399 		/* increment the operations count */
1400 		OSAddAtomic(1, &vfs_nummntops);
1401 		enablequotas(mp, ctx);
1402 
1403 		if (device_vnode) {
1404 			device_vnode->v_specflags |= SI_MOUNTEDON;
1405 
1406 			/*
1407 			 *   cache the IO attributes for the underlying physical media...
1408 			 *   an error return indicates the underlying driver doesn't
1409 			 *   support all the queries necessary... however, reasonable
1410 			 *   defaults will have been set, so no reason to bail or care
1411 			 */
1412 			vfs_init_io_attributes(device_vnode, mp);
1413 		}
1414 
1415 		/* Now that mount is setup, notify the listeners */
1416 		vfs_notify_mount(pvp);
1417 		IOBSDMountChange(mp, kIOMountChangeMount);
1418 	} else {
1419 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1420 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1421 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1422 			    mp->mnt_vtable->vfc_name, error);
1423 		}
1424 
1425 		vnode_lock_spin(vp);
1426 		CLR(vp->v_flag, VMOUNT);
1427 		vnode_unlock(vp);
1428 		mount_list_lock();
1429 		mp->mnt_vtable->vfc_refcount--;
1430 		mount_list_unlock();
1431 
1432 		if (device_vnode) {
1433 			vnode_rele(device_vnode);
1434 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1435 		}
1436 		lck_rw_done(&mp->mnt_rwlock);
1437 		is_rwlock_locked = FALSE;
1438 
1439 		/*
1440 		 * if we get here, we have a mount structure that needs to be freed,
1441 		 * but since the coveredvp hasn't yet been updated to point at it,
1442 		 * no need to worry about other threads holding a crossref on this mp
1443 		 * so it's ok to just free it
1444 		 */
1445 		mount_lock_destroy(mp);
1446 #if CONFIG_MACF
1447 		mac_mount_label_destroy(mp);
1448 #endif
1449 		zfree(mount_zone, mp);
1450 		did_set_lmount = false;
1451 	}
1452 exit:
1453 	/*
1454 	 * drop I/O count on the device vp if there was one
1455 	 */
1456 	if (devpath && devvp) {
1457 		vnode_put(devvp);
1458 	}
1459 
1460 	if (did_set_lmount) {
1461 		mount_lock_spin(mp);
1462 		mp->mnt_lflag &= ~MNT_LMOUNT;
1463 		mount_unlock(mp);
1464 	}
1465 
1466 	return error;
1467 
1468 /* Error condition exits */
1469 out4:
1470 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1471 
1472 	/*
1473 	 * If the mount has been placed on the covered vp,
1474 	 * it may have been discovered by now, so we have
1475 	 * to treat this just like an unmount
1476 	 */
1477 	mount_lock_spin(mp);
1478 	mp->mnt_lflag |= MNT_LDEAD;
1479 	mount_unlock(mp);
1480 
1481 	if (device_vnode != NULLVP) {
1482 		vnode_rele(device_vnode);
1483 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1484 		    ctx);
1485 		did_rele = TRUE;
1486 	}
1487 
1488 	vnode_lock_spin(vp);
1489 
1490 	mp->mnt_crossref++;
1491 	vp->v_mountedhere = (mount_t) 0;
1492 
1493 	vnode_unlock(vp);
1494 
1495 	if (have_usecount) {
1496 		vnode_rele(vp);
1497 	}
1498 out3:
1499 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1500 		vnode_rele(devvp);
1501 	}
1502 out2:
1503 	if (devpath && devvp) {
1504 		vnode_put(devvp);
1505 	}
1506 out1:
1507 	/* Release mnt_rwlock only when it was taken */
1508 	if (is_rwlock_locked == TRUE) {
1509 		if (flag_set) {
1510 			mp->mnt_flag = flag;  /* restore mnt_flag value */
1511 		}
1512 		lck_rw_done(&mp->mnt_rwlock);
1513 	}
1514 
1515 	if (did_set_lmount) {
1516 		mount_lock_spin(mp);
1517 		mp->mnt_lflag &= ~MNT_LMOUNT;
1518 		mount_unlock(mp);
1519 	}
1520 
1521 	if (mntalloc) {
1522 		if (mp->mnt_crossref) {
1523 			mount_dropcrossref(mp, vp, 0);
1524 		} else {
1525 			mount_lock_destroy(mp);
1526 #if CONFIG_MACF
1527 			mac_mount_label_destroy(mp);
1528 #endif
1529 			zfree(mount_zone, mp);
1530 		}
1531 	}
1532 	if (vfsp_ref) {
1533 		mount_list_lock();
1534 		vfsp->vfc_refcount--;
1535 		mount_list_unlock();
1536 	}
1537 
1538 	return error;
1539 }
1540 
1541 /*
1542  * Flush in-core data, check for competing mount attempts,
1543  * and set VMOUNT
1544  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH skips the directory-ownership check below */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	/* fmount() uses a stricter busy test (see is_busy below) */
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data for the covered vnode to disk... */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* ...then invalidate any remaining buffers so nothing stale survives */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can serve as mount points */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);
	/*
	 * fmount() rejects a vnode that is either mid-mount (VMOUNT set) or
	 * already covered; the regular path only rejects a vnode that is
	 * both mid-mount and already covered.
	 */
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Claim the vnode: competing mount attempts now see VMOUNT and back off */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	/* MAC policy veto: undo the VMOUNT claim if the mount is rejected */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1610 
1611 #if CONFIG_IMGSRC_ACCESS
1612 
1613 #define DEBUG_IMGSRC 0
1614 
1615 #if DEBUG_IMGSRC
1616 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1617 #else
1618 #define IMGSRC_DEBUG(args...) do { } while(0)
1619 #endif
1620 
/*
 * Resolve the supplied device path for an imgsrc mount update, verify it
 * names the same block device that backs `mp`, authorize a non-root caller
 * against it, and update f_mntfromname from the resolved path.
 * On success *devvpp is returned holding the iocount taken by namei();
 * the caller is responsible for vnode_put() on it.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	mode_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* A kernel context implies devpath is a kernel-space string */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	/* namei() returns with an iocount held on nd.ni_vp */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	/* The path must name a block device */
	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The path must resolve to the very device backing the mount */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: hand the iocount on vp to the caller */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, drop the iocount namei() took on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
1698 
1699 /*
1700  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1701  * and call checkdirs()
1702  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the mount on the covered vnode and drop the VMOUNT claim */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * ensure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Usecount pins the covered vnode for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/*
	 * NOTE(review): on failure only mnt_vnodecovered is cleared here;
	 * vp->v_mountedhere is left pointing at mp — verify callers undo it.
	 */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1748 
/*
 * Reverse place_mount_and_checkdirs(): drop the covered vnode's usecount,
 * detach the mount from it, and clear mnt_vnodecovered.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	/* Drop the usecount taken by vnode_ref() when the mount was placed */
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1759 
1760 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)1761 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1762 {
1763 	int error;
1764 
1765 	/* unmount in progress return error */
1766 	mount_lock_spin(mp);
1767 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1768 		mount_unlock(mp);
1769 		return EBUSY;
1770 	}
1771 	mount_unlock(mp);
1772 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1773 
1774 	/*
1775 	 * We only allow the filesystem to be reloaded if it
1776 	 * is currently mounted read-only.
1777 	 */
1778 	if ((flags & MNT_RELOAD) &&
1779 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1780 		error = ENOTSUP;
1781 		goto out;
1782 	}
1783 
1784 	/*
1785 	 * Only root, or the user that did the original mount is
1786 	 * permitted to update it.
1787 	 */
1788 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1789 	    (!vfs_context_issuser(ctx))) {
1790 		error = EPERM;
1791 		goto out;
1792 	}
1793 #if CONFIG_MACF
1794 	error = mac_mount_check_remount(ctx, mp);
1795 	if (error != 0) {
1796 		goto out;
1797 	}
1798 #endif
1799 
1800 out:
1801 	if (error) {
1802 		lck_rw_done(&mp->mnt_rwlock);
1803 	}
1804 
1805 	return error;
1806 }
1807 
/* Release the mount rwlock taken by mount_begin_update() */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1813 
1814 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)1815 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1816 {
1817 	vnode_t vp;
1818 
1819 	if (height >= MAX_IMAGEBOOT_NESTING) {
1820 		return EINVAL;
1821 	}
1822 
1823 	vp = imgsrc_rootvnodes[height];
1824 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1825 		*rvpp = vp;
1826 		return 0;
1827 	} else {
1828 		return ENOENT;
1829 	}
1830 }
1831 
1832 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)1833 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1834     struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1835     boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1836 {
1837 	int error;
1838 	mount_t mp;
1839 	boolean_t placed = FALSE;
1840 	struct vfstable *vfsp;
1841 	user_addr_t devpath;
1842 	char *old_mntonname;
1843 	vnode_t rvp;
1844 	vnode_t devvp;
1845 	uint32_t height;
1846 	uint32_t flags;
1847 
1848 	/* If we didn't imageboot, nothing to move */
1849 	if (imgsrc_rootvnodes[0] == NULLVP) {
1850 		return EINVAL;
1851 	}
1852 
1853 	/* Only root can do this */
1854 	if (!vfs_context_issuser(ctx)) {
1855 		return EPERM;
1856 	}
1857 
1858 	IMGSRC_DEBUG("looking for root vnode.\n");
1859 
1860 	/*
1861 	 * Get root vnode of filesystem we're moving.
1862 	 */
1863 	if (by_index) {
1864 		if (is64bit) {
1865 			struct user64_mnt_imgsrc_args mia64;
1866 			error = copyin(fsmountargs, &mia64, sizeof(mia64));
1867 			if (error != 0) {
1868 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
1869 				return error;
1870 			}
1871 
1872 			height = mia64.mi_height;
1873 			flags = mia64.mi_flags;
1874 			devpath = (user_addr_t)mia64.mi_devpath;
1875 		} else {
1876 			struct user32_mnt_imgsrc_args mia32;
1877 			error = copyin(fsmountargs, &mia32, sizeof(mia32));
1878 			if (error != 0) {
1879 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
1880 				return error;
1881 			}
1882 
1883 			height = mia32.mi_height;
1884 			flags = mia32.mi_flags;
1885 			devpath = mia32.mi_devpath;
1886 		}
1887 	} else {
1888 		/*
1889 		 * For binary compatibility--assumes one level of nesting.
1890 		 */
1891 		if (is64bit) {
1892 			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1893 				return error;
1894 			}
1895 		} else {
1896 			user32_addr_t tmp;
1897 			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1898 				return error;
1899 			}
1900 
1901 			/* munge into LP64 addr */
1902 			devpath = CAST_USER_ADDR_T(tmp);
1903 		}
1904 
1905 		height = 0;
1906 		flags = 0;
1907 	}
1908 
1909 	if (flags != 0) {
1910 		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1911 		return EINVAL;
1912 	}
1913 
1914 	error = get_imgsrc_rootvnode(height, &rvp);
1915 	if (error != 0) {
1916 		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1917 		return error;
1918 	}
1919 
1920 	IMGSRC_DEBUG("got old root vnode\n");
1921 
1922 	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
1923 
1924 	/* Can only move once */
1925 	mp = vnode_mount(rvp);
1926 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1927 		IMGSRC_DEBUG("Already moved.\n");
1928 		error = EBUSY;
1929 		goto out0;
1930 	}
1931 
1932 	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1933 	IMGSRC_DEBUG("Starting updated.\n");
1934 
1935 	/* Get exclusive rwlock on mount, authorize update on mp */
1936 	error = mount_begin_update(mp, ctx, 0);
1937 	if (error != 0) {
1938 		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1939 		goto out0;
1940 	}
1941 
1942 	/*
1943 	 * It can only be moved once.  Flag is set under the rwlock,
1944 	 * so we're now safe to proceed.
1945 	 */
1946 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1947 		IMGSRC_DEBUG("Already moved [2]\n");
1948 		goto out1;
1949 	}
1950 
1951 	IMGSRC_DEBUG("Preparing coveredvp.\n");
1952 
1953 	/* Mark covered vnode as mount in progress, authorize placing mount on top */
1954 	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
1955 	if (error != 0) {
1956 		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1957 		goto out1;
1958 	}
1959 
1960 	IMGSRC_DEBUG("Covered vp OK.\n");
1961 
1962 	/* Sanity check the name caller has provided */
1963 	vfsp = mp->mnt_vtable;
1964 	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1965 		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1966 		    vfsp->vfc_name, fsname);
1967 		error = EINVAL;
1968 		goto out2;
1969 	}
1970 
1971 	/* Check the device vnode and update mount-from name, for local filesystems */
1972 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1973 		IMGSRC_DEBUG("Local, doing device validation.\n");
1974 
1975 		if (devpath != USER_ADDR_NULL) {
1976 			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1977 			if (error) {
1978 				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1979 				goto out2;
1980 			}
1981 
1982 			vnode_put(devvp);
1983 		}
1984 	}
1985 
1986 	/*
1987 	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
1988 	 * and increment the name cache's mount generation
1989 	 */
1990 
1991 	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1992 	error = place_mount_and_checkdirs(mp, vp, ctx);
1993 	if (error != 0) {
1994 		goto out2;
1995 	}
1996 
1997 	placed = TRUE;
1998 
1999 	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2000 	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2001 
2002 	/* Forbid future moves */
2003 	mount_lock(mp);
2004 	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2005 	mount_unlock(mp);
2006 
2007 	/* Finally, add to mount list, completely ready to go */
2008 	if (mount_list_add(mp) != 0) {
2009 		/*
2010 		 * The system is shutting down trying to umount
2011 		 * everything, so fail with a plausible errno.
2012 		 */
2013 		error = EBUSY;
2014 		goto out3;
2015 	}
2016 
2017 	mount_end_update(mp);
2018 	vnode_put(rvp);
2019 	zfree(ZV_NAMEI, old_mntonname);
2020 
2021 	vfs_notify_mount(pvp);
2022 
2023 	return 0;
2024 out3:
2025 	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2026 
2027 	mount_lock(mp);
2028 	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2029 	mount_unlock(mp);
2030 
2031 out2:
2032 	/*
2033 	 * Placing the mp on the vnode clears VMOUNT,
2034 	 * so cleanup is different after that point
2035 	 */
2036 	if (placed) {
2037 		/* Rele the vp, clear VMOUNT and v_mountedhere */
2038 		undo_place_on_covered_vp(mp, vp);
2039 	} else {
2040 		vnode_lock_spin(vp);
2041 		CLR(vp->v_flag, VMOUNT);
2042 		vnode_unlock(vp);
2043 	}
2044 out1:
2045 	mount_end_update(mp);
2046 
2047 out0:
2048 	vnode_put(rvp);
2049 	zfree(ZV_NAMEI, old_mntonname);
2050 	return error;
2051 }
2052 
2053 #endif /* CONFIG_IMGSRC_ACCESS */
2054 
/*
 * Turn on disk quotas for an HFS mount, if the per-type quota "ops"
 * trigger files exist on the volume.  Errors are deliberately ignored;
 * quota activation must not interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* First probe for the "<mnton>/<opsname>.<ext>" trigger file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger present: enable quotas from the actual quota file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2088 
2089 
/*
 * Per-process callback for checkdirs(): if this process's current
 * directory (fd_cdir) or root directory (fd_rdir) is the covered (old)
 * vnode, retarget it to newdp (the root of the freshly mounted
 * filesystem), transferring usecounts accordingly.
 * Always returns PROC_RETURNED so proc_iterate() keeps going.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as newdp and are NULLed out when consumed below;
	 * old_* collect the displaced vnodes whose refs must be dropped. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* Second ref failed: give back the first and bail. */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed:
	 * the old vnodes we displaced, plus any of the two newdp
	 * refs we took above but did not install.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2169 
2170 
2171 
2172 /*
2173  * Scan all active processes to see if any of them have a current
2174  * or root directory onto which the new filesystem has just been
2175  * mounted. If so, replace them with the new mount point.
2176  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/*
	 * Fast path: a single usecount on olddp — NOTE(review): this appears
	 * to assume that lone ref is the mount's own, i.e. no process holds
	 * olddp as cwd/root, so there is nothing to fix up.
	 */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root vnode of the filesystem just mounted over olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was mounted over, swap rootvnode too. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	/* Drop the iocount VFS_ROOT gave us. */
	vnode_put(newdp);
	return 0;
}
2214 
2215 /*
2216  * Unmount a file system.
2217  *
2218  * Note: unmount takes a path to the vnode mounted on as argument,
2219  * not special file (as before).
2220  */
2221 /* ARGSUSED */
2222 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2223 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2224 {
2225 	vnode_t vp;
2226 	struct mount *mp;
2227 	int error;
2228 	struct nameidata nd;
2229 	vfs_context_t ctx = vfs_context_current();
2230 
2231 	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2232 	    UIO_USERSPACE, uap->path, ctx);
2233 	error = namei(&nd);
2234 	if (error) {
2235 		return error;
2236 	}
2237 	vp = nd.ni_vp;
2238 	mp = vp->v_mount;
2239 	nameidone(&nd);
2240 
2241 #if CONFIG_MACF
2242 	error = mac_mount_check_umount(ctx, mp);
2243 	if (error != 0) {
2244 		vnode_put(vp);
2245 		return error;
2246 	}
2247 #endif
2248 	/*
2249 	 * Must be the root of the filesystem
2250 	 */
2251 	if ((vp->v_flag & VROOT) == 0) {
2252 		vnode_put(vp);
2253 		return EINVAL;
2254 	}
2255 	mount_ref(mp, 0);
2256 	vnode_put(vp);
2257 	/* safedounmount consumes the mount ref */
2258 	return safedounmount(mp, uap->flags, ctx);
2259 }
2260 
2261 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2262 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2263 {
2264 	mount_t mp;
2265 
2266 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2267 	if (mp == (mount_t)0) {
2268 		return ENOENT;
2269 	}
2270 	mount_ref(mp, 0);
2271 	mount_iterdrop(mp);
2272 	/* safedounmount consumes the mount ref */
2273 	return safedounmount(mp, flags, ctx);
2274 }
2275 
/*
 * Entitlement that lets a non-root process unmount volumes mounted by
 * other processes (checked in safedounmount() below).
 */
#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
	"com.apple.private.vfs.role-account-unmount"
2278 
2279 /*
2280  * The mount struct comes with a mount ref which will be consumed.
2281  * Do the actual file system unmount, prevent some common foot shooting.
2282  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* Success path: dounmount() takes over the caller's mount ref. */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Failure path: the caller's mount ref is still consumed here. */
	mount_drop(mp, 0);
	return error;
}
2346 
/*
 * Do the actual file system unmount.
 *
 * Parameters:
 *	mp	mount point to tear down
 *	flags	MNT_* flags (MNT_FORCE, MNT_NOBLOCK, MNT_LNOSUB, ...)
 *	withref	non-zero if the caller passed in a mount ref to be dropped
 *	ctx	context of the unmounting thread
 *
 * On success the mount is removed from the mount list and eventually
 * freed (here for MNT_ROOTFS, otherwise via mount_dropcrossref()).
 * On failure the MNTK_UNMOUNT / MNT_LUNMOUNT / MNT_LFORCE state set
 * below is rolled back before returning.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep this process from hanging on unresponsive remote mounts. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		/* Non-forced: flush dirty data first; a sync failure aborts. */
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* Reclaim all vnodes on this mount; forced unmount closes them too. */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock across mount_list_remove(), then retake it. */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Reached with the mount lock held on both success and error paths. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Notify watchers of the parent directory of the change. */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root has no covered vnode; tear the mount down here. */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2634 
/*
 * Unmount any mounts in this filesystem.
 *
 * Builds, under the mount-list lock, an array of fsids for every mount
 * nested (directly or transitively) under mp, then unmounts them in
 * reverse (deepest-first) order.  Unmount errors are ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m indexes the last fsid recorded */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: cannot block for memory while holding mount_list_lock. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* dounmount() consumes this mount ref. */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
2695 
/*
 * Drop one crossref taken on mp while it covered dp.  If that was the
 * last crossref and dp is no longer covered by mp, free the mount
 * structure.  If need_put is set, also release the caller's iocount
 * on dp (dropped while the vnode lock is still held).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_unlock(dp);

		/* Last crossref gone and mp detached from dp: destroy it. */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2724 
2725 
2726 /*
2727  * Sync each mounted filesystem.
2728  */
#if DIAGNOSTIC
int syncprt = 0;        /* when set, sync paths call vfs_bufstats() */
#endif

int print_vmpage_stat = 0;      /* when set, sync paths call vm_countdirtypages() */
2734 
2735 /*
2736  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
2737  *			mounted read-write with the passed waitfor value.
2738  *
2739  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
2740  *		arg	user argument (please see below)
2741  *
2742  * User argument is a pointer to 32 bit unsigned integer which describes the
2743  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
2744  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2745  * waitfor value.
2746  *
2747  * Returns:		VFS_RETURNED
2748  */
2749 static int
sync_callback(mount_t mp,void * arg)2750 sync_callback(mount_t mp, void *arg)
2751 {
2752 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2753 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
2754 		unsigned waitfor = MNT_NOWAIT;
2755 
2756 		if (arg) {
2757 			waitfor = *(uint32_t*)arg;
2758 		}
2759 
2760 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
2761 		if (waitfor != MNT_WAIT &&
2762 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
2763 		    waitfor != MNT_NOWAIT &&
2764 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2765 		    waitfor != MNT_DWAIT &&
2766 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2767 			panic("Passed inappropriate waitfor %u to "
2768 			    "sync_callback()", waitfor);
2769 		}
2770 
2771 		mp->mnt_flag &= ~MNT_ASYNC;
2772 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2773 		if (asyncflag) {
2774 			mp->mnt_flag |= MNT_ASYNC;
2775 		}
2776 	}
2777 
2778 	return VFS_RETURNED;
2779 }
2780 
2781 /* ARGSUSED */
2782 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)2783 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2784 {
2785 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2786 
2787 	if (print_vmpage_stat) {
2788 		vm_countdirtypages();
2789 	}
2790 
2791 #if DIAGNOSTIC
2792 	if (syncprt) {
2793 		vfs_bufstats();
2794 	}
2795 #endif /* DIAGNOSTIC */
2796 	return 0;
2797 }
2798 
/* Media-class selector consumed by sync_internal_callback(). */
typedef enum {
	SYNC_ALL = 0,                     /* no filtering */
	SYNC_ONLY_RELIABLE_MEDIA = 1,     /* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2    /* everything else */
} sync_type_t;
2804 
2805 static int
sync_internal_callback(mount_t mp,void * arg)2806 sync_internal_callback(mount_t mp, void *arg)
2807 {
2808 	if (arg) {
2809 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2810 		    (mp->mnt_flag & MNT_LOCAL);
2811 		sync_type_t sync_type = *((sync_type_t *)arg);
2812 
2813 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2814 			return VFS_RETURNED;
2815 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2816 			return VFS_RETURNED;
2817 		}
2818 	}
2819 
2820 	(void)sync_callback(mp, NULL);
2821 
2822 	return VFS_RETURNED;
2823 }
2824 
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; accessed under sync_mtx_lck */
int sync_timeout_seconds = 5;   /* bound on how long sync_internal() waits */

#define SYNC_THREAD_RUN       0x0001    /* another sync pass has been requested */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync_thread() instance is active */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the active sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
2834 
/*
 * Worker thread started by sync_internal().  Loops while SYNC_THREAD_RUN
 * is set, syncing reliable (local, non-virtual) media first and the rest
 * second, then wakes any waiter and clears SYNC_THREAD_RUNNING.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request flag before dropping the lock. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2878 
/* Rate-limits the "sync timed out" message below to one per ~120s. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};

/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Request another pass; spawn the worker if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* Wait, bounded by ts, for sync_thread()'s wakeup; PDROP releases the lock. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2927 
2928 /*
2929  * Change filesystem quotas.
2930  */
2931 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve the path to find the target mount; keep a mount ref only. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass the wider layout; munge it down. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Second switch: copy results out / release per-command resources. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3038 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out (QUOTA not defined). */
	return EOPNOTSUPP;
}
3044 #endif /* QUOTA */
3045 
3046 /*
3047  * Get filesystem statistics.
3048  *
3049  * Returns:	0			Success
3050  *	namei:???
3051  *	vfs_update_vfsstat:???
3052  *	munge_statfs:EFAULT
3053  */
3054 /* ARGSUSED */
3055 int
statfs(__unused proc_t p,struct statfs_args * uap,__unused int32_t * retval)3056 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3057 {
3058 	struct mount *mp;
3059 	struct vfsstatfs *sp;
3060 	int error;
3061 	struct nameidata nd;
3062 	vfs_context_t ctx = vfs_context_current();
3063 	vnode_t vp;
3064 
3065 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3066 	    UIO_USERSPACE, uap->path, ctx);
3067 	error = namei(&nd);
3068 	if (error != 0) {
3069 		return error;
3070 	}
3071 	vp = nd.ni_vp;
3072 	mp = vp->v_mount;
3073 	sp = &mp->mnt_vfsstat;
3074 	nameidone(&nd);
3075 
3076 #if CONFIG_MACF
3077 	error = mac_mount_check_stat(ctx, mp);
3078 	if (error != 0) {
3079 		vnode_put(vp);
3080 		return error;
3081 	}
3082 #endif
3083 
3084 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3085 	if (error != 0) {
3086 		vnode_put(vp);
3087 		return error;
3088 	}
3089 
3090 	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3091 	vnode_put(vp);
3092 	return error;
3093 }
3094 
3095 /*
3096  * Get filesystem statistics.
3097  */
3098 /* ARGSUSED */
3099 int
fstatfs(__unused proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3100 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3101 {
3102 	vnode_t vp;
3103 	struct mount *mp;
3104 	struct vfsstatfs *sp;
3105 	int error;
3106 
3107 	AUDIT_ARG(fd, uap->fd);
3108 
3109 	if ((error = file_vnode(uap->fd, &vp))) {
3110 		return error;
3111 	}
3112 
3113 	error = vnode_getwithref(vp);
3114 	if (error) {
3115 		file_drop(uap->fd);
3116 		return error;
3117 	}
3118 
3119 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3120 
3121 	mp = vp->v_mount;
3122 	if (!mp) {
3123 		error = EBADF;
3124 		goto out;
3125 	}
3126 
3127 #if CONFIG_MACF
3128 	error = mac_mount_check_stat(vfs_context_current(), mp);
3129 	if (error != 0) {
3130 		goto out;
3131 	}
3132 #endif
3133 
3134 	sp = &mp->mnt_vfsstat;
3135 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3136 		goto out;
3137 	}
3138 
3139 	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3140 
3141 out:
3142 	file_drop(uap->fd);
3143 	vnode_put(vp);
3144 
3145 	return error;
3146 }
3147 
3148 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3149 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3150 {
3151 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3152 
3153 	bzero(sfs, sizeof(*sfs));
3154 
3155 	sfs->f_bsize = vsfs->f_bsize;
3156 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3157 	sfs->f_blocks = vsfs->f_blocks;
3158 	sfs->f_bfree = vsfs->f_bfree;
3159 	sfs->f_bavail = vsfs->f_bavail;
3160 	sfs->f_files = vsfs->f_files;
3161 	sfs->f_ffree = vsfs->f_ffree;
3162 	sfs->f_fsid = vsfs->f_fsid;
3163 	sfs->f_owner = vsfs->f_owner;
3164 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3165 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3166 	sfs->f_fssubtype = vsfs->f_fssubtype;
3167 	sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3168 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3169 		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3170 	} else {
3171 		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3172 	}
3173 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3174 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3175 }
3176 
3177 /*
3178  * Get file system statistics in 64-bit mode
3179  */
3180 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3181 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3182 {
3183 	struct mount *mp;
3184 	int error;
3185 	struct nameidata *ndp;
3186 	struct statfs64 *sfsp;
3187 	vfs_context_t ctxp = vfs_context_current();
3188 	vnode_t vp;
3189 	struct {
3190 		struct nameidata nd;
3191 		struct statfs64 sfs;
3192 	} *__nameidata_statfs64;
3193 
3194 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3195 	    Z_WAITOK);
3196 	ndp = &__nameidata_statfs64->nd;
3197 
3198 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3199 	    UIO_USERSPACE, uap->path, ctxp);
3200 	error = namei(ndp);
3201 	if (error != 0) {
3202 		goto out;
3203 	}
3204 	vp = ndp->ni_vp;
3205 	mp = vp->v_mount;
3206 	nameidone(ndp);
3207 
3208 #if CONFIG_MACF
3209 	error = mac_mount_check_stat(ctxp, mp);
3210 	if (error != 0) {
3211 		vnode_put(vp);
3212 		goto out;
3213 	}
3214 #endif
3215 
3216 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3217 	if (error != 0) {
3218 		vnode_put(vp);
3219 		goto out;
3220 	}
3221 
3222 	sfsp = &__nameidata_statfs64->sfs;
3223 	vfs_get_statfs64(mp, sfsp);
3224 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3225 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3226 		/* This process does not want to see a seperate data volume mountpoint */
3227 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3228 	}
3229 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3230 	vnode_put(vp);
3231 
3232 out:
3233 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3234 
3235 	return error;
3236 }
3237 
3238 /*
3239  * Get file system statistics in 64-bit mode
3240  */
3241 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3242 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3243 {
3244 	struct vnode *vp;
3245 	struct mount *mp;
3246 	struct statfs64 sfs;
3247 	int error;
3248 
3249 	AUDIT_ARG(fd, uap->fd);
3250 
3251 	if ((error = file_vnode(uap->fd, &vp))) {
3252 		return error;
3253 	}
3254 
3255 	error = vnode_getwithref(vp);
3256 	if (error) {
3257 		file_drop(uap->fd);
3258 		return error;
3259 	}
3260 
3261 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3262 
3263 	mp = vp->v_mount;
3264 	if (!mp) {
3265 		error = EBADF;
3266 		goto out;
3267 	}
3268 
3269 #if CONFIG_MACF
3270 	error = mac_mount_check_stat(vfs_context_current(), mp);
3271 	if (error != 0) {
3272 		goto out;
3273 	}
3274 #endif
3275 
3276 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3277 		goto out;
3278 	}
3279 
3280 	vfs_get_statfs64(mp, &sfs);
3281 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3282 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3283 		/* This process does not want to see a seperate data volume mountpoint */
3284 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3285 	}
3286 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3287 
3288 out:
3289 	file_drop(uap->fd);
3290 	vnode_put(vp);
3291 
3292 	return error;
3293 }
3294 
/*
 * Shared cursor/state passed through vfs_iterate() by the
 * getfsstat family of syscalls.
 */
struct getfsstat_struct {
	user_addr_t     sfsp;           /* next user-space slot to copy a statfs into */
	user_addr_t     *mp;            /* optional per-mount MAC label pointers (may be NULL) */
	int             count;          /* mounts reported so far */
	int             maxcount;       /* capacity of the user buffer, in entries */
	int             flags;          /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT */
	int             error;          /* first error hit by the callback, 0 if none */
};
3303 
3304 
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat().
 *
 * While the user buffer has room, optionally refreshes the mount's
 * cached vfsstat and copies a caller-ABI-sized struct statfs out,
 * advancing the buffer cursor.  Mounts whose stat refresh fails are
 * skipped (not counted); hard errors are stashed in fstp->error and
 * stop the iteration via VFS_RETURNED_DONE.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			/* skip this mount (it is not counted) but keep iterating */
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* my_size is the number of bytes munge_statfs copied out */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			/* copy this mount's MAC label to the caller-supplied slot */
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3358 
3359 /*
3360  * Get statistics on all filesystems.
3361  */
3362 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3363 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3364 {
3365 	struct __mac_getfsstat_args muap;
3366 
3367 	muap.buf = uap->buf;
3368 	muap.bufsize = uap->bufsize;
3369 	muap.mac = USER_ADDR_NULL;
3370 	muap.macsize = 0;
3371 	muap.flags = uap->flags;
3372 
3373 	return __mac_getfsstat(p, &muap, retval);
3374 }
3375 
3376 /*
3377  * __mac_getfsstat: Get MAC-related file system statistics
3378  *
3379  * Parameters:    p                        (ignored)
3380  *                uap                      User argument descriptor (see below)
3381  *                retval                   Count of file system statistics (N stats)
3382  *
3383  * Indirect:      uap->bufsize             Buffer size
3384  *                uap->macsize             MAC info size
3385  *                uap->buf                 Buffer where information will be returned
3386  *                uap->mac                 MAC info
3387  *                uap->flags               File system flags
3388  *
3389  *
3390  * Returns:        0                       Success
3391  *                !0                       Not success
3392  *
3393  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would not survive the (int) casts below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Buffer capacity in entries, sized by the caller's ABI. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC pointer array must have exactly one slot per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* Widen each entry: 64-bit callers already pass user_addr_t-sized slots. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount (including those being unmounted) through the callback. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report at most maxcount entries; fst.count may exceed it when the buffer was too small. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3487 
3488 static int
getfsstat64_callback(mount_t mp,void * arg)3489 getfsstat64_callback(mount_t mp, void * arg)
3490 {
3491 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3492 	struct vfsstatfs *sp;
3493 	struct statfs64 sfs;
3494 	int error;
3495 
3496 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
3497 #if CONFIG_MACF
3498 		error = mac_mount_check_stat(vfs_context_current(), mp);
3499 		if (error != 0) {
3500 			fstp->error = error;
3501 			return VFS_RETURNED_DONE;
3502 		}
3503 #endif
3504 		sp = &mp->mnt_vfsstat;
3505 		/*
3506 		 * If MNT_NOWAIT is specified, do not refresh the fsstat
3507 		 * cache. MNT_WAIT overrides MNT_NOWAIT.
3508 		 *
3509 		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3510 		 * getfsstat, since the constants are out of the same
3511 		 * namespace.
3512 		 */
3513 		if ((mp->mnt_lflag & MNT_LDEAD) ||
3514 		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3515 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3516 		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3517 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3518 			return VFS_RETURNED;
3519 		}
3520 
3521 		vfs_get_statfs64(mp, &sfs);
3522 		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3523 		if (error) {
3524 			fstp->error = error;
3525 			return VFS_RETURNED_DONE;
3526 		}
3527 		fstp->sfsp += sizeof(sfs);
3528 	}
3529 	fstp->count++;
3530 	return VFS_RETURNED;
3531 }
3532 
3533 /*
3534  * Get statistics on all file systems in 64 bit mode.
3535  */
3536 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3537 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3538 {
3539 	user_addr_t sfsp;
3540 	int count, maxcount;
3541 	struct getfsstat_struct fst;
3542 
3543 	maxcount = uap->bufsize / sizeof(struct statfs64);
3544 
3545 	sfsp = uap->buf;
3546 	count = 0;
3547 
3548 	fst.sfsp = sfsp;
3549 	fst.flags = uap->flags;
3550 	fst.count = 0;
3551 	fst.error = 0;
3552 	fst.maxcount = maxcount;
3553 
3554 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3555 
3556 	if (fst.error) {
3557 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3558 		return fst.error;
3559 	}
3560 
3561 	if (fst.sfsp && fst.count > fst.maxcount) {
3562 		*retval = fst.maxcount;
3563 	} else {
3564 		*retval = fst.count;
3565 	}
3566 
3567 	return 0;
3568 }
3569 
3570 /*
3571  * gets the associated vnode with the file descriptor passed.
3572  * as input
3573  *
3574  * INPUT
3575  * ctx - vfs context of caller
3576  * fd - file descriptor for which vnode is required.
3577  * vpp - Pointer to pointer to vnode to be returned.
3578  *
3579  * The vnode is returned with an iocount so any vnode obtained
3580  * by this call needs a vnode_put
3581  *
3582  */
3583 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3584 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3585 {
3586 	int error;
3587 	vnode_t vp;
3588 	struct fileproc *fp;
3589 	proc_t p = vfs_context_proc(ctx);
3590 
3591 	*vpp =  NULLVP;
3592 
3593 	error = fp_getfvp(p, fd, &fp, &vp);
3594 	if (error) {
3595 		return error;
3596 	}
3597 
3598 	error = vnode_getwithref(vp);
3599 	if (error) {
3600 		(void)fp_drop(p, fd, fp, 0);
3601 		return error;
3602 	}
3603 
3604 	(void)fp_drop(p, fd, fp, 0);
3605 	*vpp = vp;
3606 	return error;
3607 }
3608 
3609 /*
3610  * Wrapper function around namei to start lookup from a directory
3611  * specified by a file descriptor ni_dirfd.
3612  *
3613  * In addition to all the errors returned by namei, this call can
3614  * return ENOTDIR if the file descriptor does not refer to a directory.
3615  * and EBADF if the file descriptor is not valid.
3616  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only divert through dirfd when it is a real descriptor, this is
	 * the initial pass of the lookup (not a continuation), and the
	 * caller has not already supplied a starting directory vnode.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the path's first byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			/* Relative path: anchor the lookup at dirfd's vnode. */
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* USEDVP tells namei to start at ni_dvp; clear it afterwards. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: ordinary namei semantics. */
	return namei(ndp);
}
3660 
3661 /*
3662  * Change current working directory to a given file descriptor.
3663  */
3664 /* ARGSUSED */
/*
 * Change the current working directory (process-wide, or per-thread
 * when per_thread is set) to the directory backing uap->fd.
 *
 * Returns 0 on success; ENOTDIR if the fd is not a directory, EBADF,
 * EACCES if a covering mount is busy, ENOENT if no thread is
 * available for a per-thread switch, or any error from the
 * authorization / VFS_ROOT calls.
 */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;	/* should_put: vp still carries our iocount */
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must be able to search the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* If the directory is a mount point, descend to the covering filesystem's root. */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Convert our transient iocount into a persistent usecount for the cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* dirs lock keeps fd_cdir's usecount stable for concurrent lookups */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the reference held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
3777 
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir to the directory open at uap->fd. */
	return common_fchdir(p, uap, 0);
}
3783 
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread chdir; the cast assumes __pthread_fchdir_args and
	 * fchdir_args share the same layout (both carry just the fd).
	 */
	return common_fchdir(p, (void *)uap, 1);
}
3789 
3790 
3791 /*
3792  * Change current working directory (".").
3793  *
3794  * Returns:	0			Success
3795  *	change_dir:ENOTDIR
3796  *	change_dir:???
3797  *	vnode_ref:ENOENT		No such file or directory
3798  */
3799 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve and authorize the target; on success we hold an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a persistent usecount for the cwd before dropping the iocount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* no thread to attach the per-thread cwd to */
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* dirs lock keeps fd_cdir's usecount stable for concurrent lookups */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the reference held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3845 
3846 
3847 /*
3848  * Change current working directory (".").
3849  *
3850  * Returns:	0			Success
3851  *	chdir_internal:ENOTDIR
3852  *	chdir_internal:ENOENT		No such file or directory
3853  *	chdir_internal:???
3854  */
3855 /* ARGSUSED */
3856 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)3857 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3858 {
3859 	struct nameidata nd;
3860 	vfs_context_t ctx = vfs_context_current();
3861 
3862 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3863 	    UIO_USERSPACE, uap->path, ctx);
3864 
3865 	return chdir_internal(p, ctx, &nd, per_thread);
3866 }
3867 
3868 
3869 /*
3870  * chdir
3871  *
3872  * Change current working directory (".") for the entire process
3873  *
3874  * Parameters:  p       Process requesting the call
3875  *              uap     User argument descriptor (see below)
3876  *              retval  (ignored)
3877  *
3878  * Indirect parameters:	uap->path	Directory path
3879  *
3880  * Returns:	0			Success
3881  *              common_chdir: ENOTDIR
3882  *              common_chdir: ENOENT	No such file or directory
3883  *              common_chdir: ???
3884  *
3885  */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir; cast assumes chdir_args layout (just the path). */
	return common_chdir(p, (void *)uap, 0);
}
3891 
3892 /*
3893  * __pthread_chdir
3894  *
3895  * Change current working directory (".") for a single thread
3896  *
3897  * Parameters:  p       Process requesting the call
3898  *              uap     User argument descriptor (see below)
3899  *              retval  (ignored)
3900  *
3901  * Indirect parameters:	uap->path	Directory path
3902  *
3903  * Returns:	0			Success
3904  *              common_chdir: ENOTDIR
3905  *		common_chdir: ENOENT	No such file or directory
3906  *		common_chdir: ???
3907  *
3908  */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir; cast assumes chdir_args layout (just the path). */
	return common_chdir(p, (void *)uap, 1);
}
3914 
3915 
3916 /*
3917  * Change notion of root (``/'') directory.
3918  */
3919 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Resolve and authorize the target; on success we hold an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Take a persistent usecount for fd_rdir before dropping the iocount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the reference held by the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3977 
3978 #define PATHSTATICBUFLEN 256
3979 #define PIVOT_ROOT_ENTITLEMENT              \
3980        "com.apple.private.vfs.pivot-root"
3981 
3982 #if defined(XNU_TARGET_OS_OSX)
/*
 * Switch the system root filesystem: mount the volume at
 * uap->new_rootfs_path_before as "/" and re-home the old root at
 * uap->old_rootfs_path_after.  Restricted to launchd (pid 1) holding
 * the pivot-root entitlement.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Try the small stack buffer first; fall back to a MAXPATHLEN heap buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-tier copyin for the outgoing root's new mount path. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4074 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS targets. */
	return nosys(p, NULL, retval);
}
4080 #endif /* XNU_TARGET_OS_OSX */
4081 
4082 /*
4083  * Common routine for chroot and chdir.
4084  *
4085  * Returns:	0			Success
4086  *		ENOTDIR			Not a directory
4087  *		namei:???		[anything namei can return]
4088  *		vnode_authorize:???	[anything vnode_authorize can return]
4089  */
4090 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4091 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4092 {
4093 	vnode_t vp;
4094 	int error;
4095 
4096 	if ((error = namei(ndp))) {
4097 		return error;
4098 	}
4099 	nameidone(ndp);
4100 	vp = ndp->ni_vp;
4101 
4102 	if (vp->v_type != VDIR) {
4103 		vnode_put(vp);
4104 		return ENOTDIR;
4105 	}
4106 
4107 #if CONFIG_MACF
4108 	error = mac_vnode_check_chdir(ctx, vp);
4109 	if (error) {
4110 		vnode_put(vp);
4111 		return error;
4112 	}
4113 #endif
4114 
4115 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4116 	if (error) {
4117 		vnode_put(vp);
4118 		return error;
4119 	}
4120 
4121 	return error;
4122 }
4123 
4124 /*
4125  * Free the vnode data (for directories) associated with the file glob.
4126  */
4127 struct fd_vn_data *
fg_vn_data_alloc(void)4128 fg_vn_data_alloc(void)
4129 {
4130 	struct fd_vn_data *fvdata;
4131 
4132 	/* Allocate per fd vnode data */
4133 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4134 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4135 	return fvdata;
4136 }
4137 
4138 /*
4139  * Free the vnode data (for directories) associated with the file glob.
4140  */
4141 void
fg_vn_data_free(void * fgvndata)4142 fg_vn_data_free(void *fgvndata)
4143 {
4144 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4145 
4146 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4147 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4148 	kfree_type(struct fd_vn_data, fvdata);
4149 }
4150 
4151 /*
4152  * Check permissions, allocate an open file structure,
4153  * and call the device open routine if any.
4154  *
4155  * Returns:	0			Success
4156  *		EINVAL
4157  *		EINTR
4158  *	falloc:ENFILE
4159  *	falloc:EMFILE
4160  *	falloc:ENOMEM
4161  *	vn_open_auth:???
4162  *	dupfdopen:???
4163  *	VNOP_ADVLOCK:???
4164  *	vnode_setsize:???
4165  *
4166  * XXX Need to implement uid, gid
4167  */
4168 int
open1(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval)4169 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4170     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval)
4171 {
4172 	proc_t p = vfs_context_proc(ctx);
4173 	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
4174 	struct fileproc *fp;
4175 	vnode_t vp;
4176 	int flags, oflags;
4177 	int type, indx, error;
4178 	struct vfs_context context;
4179 
4180 	oflags = uflags;
4181 
4182 	if ((oflags & O_ACCMODE) == O_ACCMODE) {
4183 		return EINVAL;
4184 	}
4185 
4186 	flags = FFLAGS(uflags);
4187 	CLR(flags, FENCRYPTED);
4188 	CLR(flags, FUNENCRYPTED);
4189 
4190 	AUDIT_ARG(fflags, oflags);
4191 	AUDIT_ARG(mode, vap->va_mode);
4192 
4193 	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
4194 		return error;
4195 	}
4196 	if (flags & O_CLOEXEC) {
4197 		fp->fp_flags |= FP_CLOEXEC;
4198 	}
4199 	if (flags & O_CLOFORK) {
4200 		fp->fp_flags |= FP_CLOFORK;
4201 	}
4202 
4203 	/* setup state to recognize when fdesc_open was called */
4204 	uu->uu_dupfd = -1;
4205 
4206 	if ((error = vn_open_auth(ndp, &flags, vap))) {
4207 		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
4208 			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
4209 				*retval = indx;
4210 				return 0;
4211 			}
4212 		}
4213 		if (error == ERESTART) {
4214 			error = EINTR;
4215 		}
4216 		fp_free(p, indx, fp);
4217 		return error;
4218 	}
4219 	uu->uu_dupfd = 0;
4220 	vp = ndp->ni_vp;
4221 
4222 	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
4223 	fp->fp_glob->fg_ops = &vnops;
4224 	fp_set_data(fp, vp);
4225 
4226 	if (flags & (O_EXLOCK | O_SHLOCK)) {
4227 		struct flock lf = {
4228 			.l_whence = SEEK_SET,
4229 		};
4230 
4231 		if (flags & O_EXLOCK) {
4232 			lf.l_type = F_WRLCK;
4233 		} else {
4234 			lf.l_type = F_RDLCK;
4235 		}
4236 		type = F_FLOCK;
4237 		if ((flags & FNONBLOCK) == 0) {
4238 			type |= F_WAIT;
4239 		}
4240 #if CONFIG_MACF
4241 		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
4242 		    F_SETLK, &lf);
4243 		if (error) {
4244 			goto bad;
4245 		}
4246 #endif
4247 		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
4248 			goto bad;
4249 		}
4250 		fp->fp_glob->fg_flag |= FWASLOCKED;
4251 	}
4252 
4253 	/* try to truncate by setting the size attribute */
4254 	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
4255 		goto bad;
4256 	}
4257 
4258 	/*
4259 	 * For directories we hold some additional information in the fd.
4260 	 */
4261 	if (vnode_vtype(vp) == VDIR) {
4262 		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
4263 	} else {
4264 		fp->fp_glob->fg_vn_data = NULL;
4265 	}
4266 
4267 	vnode_put(vp);
4268 
4269 	/*
4270 	 * The first terminal open (without a O_NOCTTY) by a session leader
4271 	 * results in it being set as the controlling terminal.
4272 	 */
4273 	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
4274 	    !(flags & O_NOCTTY)) {
4275 		int tmp = 0;
4276 
4277 		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
4278 		    (caddr_t)&tmp, ctx);
4279 	}
4280 
4281 	proc_fdlock(p);
4282 	procfdtbl_releasefd(p, indx, NULL);
4283 
4284 #if CONFIG_SECLUDED_MEMORY
4285 	if (secluded_for_filecache &&
4286 	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
4287 	    vnode_vtype(vp) == VREG) {
4288 		memory_object_control_t moc;
4289 
4290 		moc = ubc_getobject(vp, UBC_FLAGS_NONE);
4291 
4292 		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
4293 			/* nothing to do... */
4294 		} else if (fp->fp_glob->fg_flag & FWRITE) {
4295 			/* writable -> no longer  eligible for secluded pages */
4296 			memory_object_mark_eligible_for_secluded(moc,
4297 			    FALSE);
4298 		} else if (secluded_for_filecache == 1) {
4299 			char pathname[32] = { 0, };
4300 			size_t copied;
4301 			/* XXX FBDP: better way to detect /Applications/ ? */
4302 			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4303 				(void)copyinstr(ndp->ni_dirp,
4304 				    pathname,
4305 				    sizeof(pathname),
4306 				    &copied);
4307 			} else {
4308 				copystr(CAST_DOWN(void *, ndp->ni_dirp),
4309 				    pathname,
4310 				    sizeof(pathname),
4311 				    &copied);
4312 			}
4313 			pathname[sizeof(pathname) - 1] = '\0';
4314 			if (strncmp(pathname,
4315 			    "/Applications/",
4316 			    strlen("/Applications/")) == 0 &&
4317 			    strncmp(pathname,
4318 			    "/Applications/Camera.app/",
4319 			    strlen("/Applications/Camera.app/")) != 0) {
4320 				/*
4321 				 * not writable
4322 				 * AND from "/Applications/"
4323 				 * AND not from "/Applications/Camera.app/"
4324 				 * ==> eligible for secluded
4325 				 */
4326 				memory_object_mark_eligible_for_secluded(moc,
4327 				    TRUE);
4328 			}
4329 		} else if (secluded_for_filecache == 2) {
4330 			size_t len = strlen(vp->v_name);
4331 			if (!strncmp(vp->v_name, "dyld", len) ||
4332 			    !strncmp(vp->v_name, "launchd", len) ||
4333 			    !strncmp(vp->v_name, "Camera", len) ||
4334 			    !strncmp(vp->v_name, "mediaserverd", len) ||
4335 			    !strncmp(vp->v_name, "SpringBoard", len) ||
4336 			    !strncmp(vp->v_name, "backboardd", len)) {
4337 				/*
4338 				 * This file matters when launching Camera:
4339 				 * do not store its contents in the secluded
4340 				 * pool that will be drained on Camera launch.
4341 				 */
4342 				memory_object_mark_eligible_for_secluded(moc,
4343 				    FALSE);
4344 			}
4345 		}
4346 	}
4347 #endif /* CONFIG_SECLUDED_MEMORY */
4348 
4349 	fp_drop(p, indx, fp, 1);
4350 	proc_fdunlock(p);
4351 
4352 	*retval = indx;
4353 
4354 	return 0;
4355 bad:
4356 	context = *vfs_context_current();
4357 	context.vc_ucred = fp->fp_glob->fg_cred;
4358 
4359 	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
4360 	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
4361 		struct flock lf = {
4362 			.l_whence = SEEK_SET,
4363 			.l_type = F_UNLCK,
4364 		};
4365 
4366 		(void)VNOP_ADVLOCK(
4367 			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4368 	}
4369 
4370 	vn_close(vp, fp->fp_glob->fg_flag, &context);
4371 	vnode_put(vp);
4372 	fp_free(p, indx, fp);
4373 
4374 	return error;
4375 }
4376 
4377 /*
4378  * While most of the *at syscall handlers can call nameiat() which
4379  * is a wrapper around namei, the use of namei and initialisation
4380  * of nameidata are far removed and in different functions  - namei
4381  * gets called in vn_open_auth for open1. So we'll just do here what
4382  * nameiat() does.
4383  */
4384 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd)4385 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4386     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4387     int dirfd)
4388 {
4389 	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4390 		int error;
4391 		char c;
4392 
4393 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4394 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
4395 			if (error) {
4396 				return error;
4397 			}
4398 		} else {
4399 			c = *((char *)(ndp->ni_dirp));
4400 		}
4401 
4402 		if (c != '/') {
4403 			vnode_t dvp_at;
4404 
4405 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4406 			    &dvp_at);
4407 			if (error) {
4408 				return error;
4409 			}
4410 
4411 			if (vnode_vtype(dvp_at) != VDIR) {
4412 				vnode_put(dvp_at);
4413 				return ENOTDIR;
4414 			}
4415 
4416 			ndp->ni_dvp = dvp_at;
4417 			ndp->ni_cnd.cn_flags |= USEDVP;
4418 			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4419 			    retval);
4420 			vnode_put(dvp_at);
4421 			return error;
4422 		}
4423 	}
4424 
4425 	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval);
4426 }
4427 
4428 /*
4429  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4430  *
4431  * Parameters:	p			Process requesting the open
4432  *		uap			User argument descriptor (see below)
4433  *		retval			Pointer to an area to receive the
4434  *					return calue from the system call
4435  *
4436  * Indirect:	uap->path		Path to open (same as 'open')
4437  *		uap->flags		Flags to open (same as 'open'
4438  *		uap->uid		UID to set, if creating
4439  *		uap->gid		GID to set, if creating
4440  *		uap->mode		File mode, if creating (same as 'open')
4441  *		uap->xsecurity		ACL to set, if creating
4442  *
4443  * Returns:	0			Success
4444  *		!0			errno value
4445  *
4446  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4447  *
4448  * XXX:		We should enummerate the possible errno values here, and where
4449  *		in the code they originated.
4450  */
4451 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4452 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4453 {
4454 	int ciferror;
4455 	kauth_filesec_t xsecdst;
4456 	struct vnode_attr va;
4457 	struct nameidata nd;
4458 	int cmode;
4459 
4460 	AUDIT_ARG(owner, uap->uid, uap->gid);
4461 
4462 	xsecdst = NULL;
4463 	if ((uap->xsecurity != USER_ADDR_NULL) &&
4464 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4465 		return ciferror;
4466 	}
4467 
4468 	VATTR_INIT(&va);
4469 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4470 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4471 	if (uap->uid != KAUTH_UID_NONE) {
4472 		VATTR_SET(&va, va_uid, uap->uid);
4473 	}
4474 	if (uap->gid != KAUTH_GID_NONE) {
4475 		VATTR_SET(&va, va_gid, uap->gid);
4476 	}
4477 	if (xsecdst != NULL) {
4478 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4479 		va.va_vaflags |= VA_FILESEC_ACL;
4480 	}
4481 
4482 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4483 	    uap->path, vfs_context_current());
4484 
4485 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4486 	    NULL, NULL, retval);
4487 	if (xsecdst != NULL) {
4488 		kauth_filesec_free(xsecdst);
4489 	}
4490 
4491 	return ciferror;
4492 }
4493 
4494 /*
4495  * Go through the data-protected atomically controlled open (2)
4496  *
4497  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4498  */
4499 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4500 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4501 {
4502 	int flags = uap->flags;
4503 	int class = uap->class;
4504 	int dpflags = uap->dpflags;
4505 
4506 	/*
4507 	 * Follow the same path as normal open(2)
4508 	 * Look up the item if it exists, and acquire the vnode.
4509 	 */
4510 	struct vnode_attr va;
4511 	struct nameidata nd;
4512 	int cmode;
4513 	int error;
4514 
4515 	VATTR_INIT(&va);
4516 	/* Mask off all but regular access permissions */
4517 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4518 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4519 
4520 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4521 	    uap->path, vfs_context_current());
4522 
4523 	/*
4524 	 * Initialize the extra fields in vnode_attr to pass down our
4525 	 * extra fields.
4526 	 * 1. target cprotect class.
4527 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4528 	 */
4529 	if (flags & O_CREAT) {
4530 		/* lower level kernel code validates that the class is valid before applying it. */
4531 		if (class != PROTECTION_CLASS_DEFAULT) {
4532 			/*
4533 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4534 			 * file behave the same as open (2)
4535 			 */
4536 			VATTR_SET(&va, va_dataprotect_class, class);
4537 		}
4538 	}
4539 
4540 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4541 		if (flags & (O_RDWR | O_WRONLY)) {
4542 			/* Not allowed to write raw encrypted bytes */
4543 			return EINVAL;
4544 		}
4545 		if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4546 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4547 		}
4548 		if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4549 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4550 		}
4551 	}
4552 
4553 	error = open1(vfs_context_current(), &nd, uap->flags, &va,
4554 	    NULL, NULL, retval);
4555 
4556 	return error;
4557 }
4558 
/*
 * Common implementation behind open(2)/openat(2)/openbyid_np(2):
 * builds the vnode attributes and nameidata, then dispatches to open1at().
 */
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * vnode_attr + nameidata are large; allocate them together on the
	 * heap rather than on the kernel stack.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
4591 
4592 int
open(proc_t p,struct open_args * uap,int32_t * retval)4593 open(proc_t p, struct open_args *uap, int32_t *retval)
4594 {
4595 	__pthread_testcancel(1);
4596 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4597 }
4598 
4599 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)4600 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4601     int32_t *retval)
4602 {
4603 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
4604 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4605 }
4606 
4607 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)4608 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4609     int32_t *retval)
4610 {
4611 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
4612 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
4613 }
4614 
4615 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)4616 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4617 {
4618 	__pthread_testcancel(1);
4619 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4620 }
4621 
4622 /*
4623  * openbyid_np: open a file given a file system id and a file system object id
4624  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
4625  *	file systems that don't support object ids it is a node id (uint64_t).
4626  *
4627  * Parameters:	p			Process requesting the open
4628  *		uap			User argument descriptor (see below)
4629  *		retval			Pointer to an area to receive the
4630  *					return calue from the system call
4631  *
4632  * Indirect:	uap->path		Path to open (same as 'open')
4633  *
4634  *		uap->fsid		id of target file system
4635  *		uap->objid		id of target file system object
4636  *		uap->flags		Flags to open (same as 'open')
4637  *
4638  * Returns:	0			Success
4639  *		!0			errno value
4640  *
4641  *
4642  * XXX:		We should enummerate the possible errno values here, and where
4643  *		in the code they originated.
4644  */
4645 int
openbyid_np(__unused proc_t p,struct openbyid_np_args * uap,int * retval)4646 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
4647 {
4648 	fsid_t fsid;
4649 	uint64_t objid;
4650 	int error;
4651 	char *buf = NULL;
4652 	int buflen = MAXPATHLEN;
4653 	int pathlen = 0;
4654 	vfs_context_t ctx = vfs_context_current();
4655 
4656 	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
4657 		return error;
4658 	}
4659 
4660 	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
4661 		return error;
4662 	}
4663 
4664 	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
4665 	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
4666 		return error;
4667 	}
4668 
4669 	AUDIT_ARG(value32, fsid.val[0]);
4670 	AUDIT_ARG(value64, objid);
4671 
4672 	/*resolve path from fsis, objid*/
4673 	do {
4674 		buf = kalloc_data(buflen + 1, Z_WAITOK);
4675 		if (buf == NULL) {
4676 			return ENOMEM;
4677 		}
4678 
4679 		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
4680 		    buf, FSOPT_ISREALFSID, &pathlen);
4681 
4682 		if (error) {
4683 			kfree_data(buf, buflen + 1);
4684 			buf = NULL;
4685 		}
4686 	} while (error == ENOSPC && (buflen += MAXPATHLEN));
4687 
4688 	if (error) {
4689 		return error;
4690 	}
4691 
4692 	buf[pathlen] = 0;
4693 
4694 	error = openat_internal(
4695 		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
4696 
4697 	kfree_data(buf, buflen + 1);
4698 
4699 	return error;
4700 }
4701 
4702 
4703 /*
4704  * Create a special file.
4705  */
4706 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4707 
/*
 * mknod(2): create a special file (character or block device node),
 * or — when S_IFIFO is requested — a named pipe via mkfifo1().
 */
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	/* Requested mode filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, (mode_t)uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device nodes requires superuser credentials. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The path must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		/* only character and block devices are handled here */
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4810 
4811 /*
4812  * Create a named pipe.
4813  *
4814  * Returns:	0			Success
4815  *		EEXIST
4816  *	namei:???
4817  *	vnode_authorize:???
4818  *	vn_create:???
4819  */
4820 static int
mkfifo1(vfs_context_t ctx,user_addr_t upath,struct vnode_attr * vap)4821 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4822 {
4823 	vnode_t vp, dvp;
4824 	int error;
4825 	struct nameidata nd;
4826 
4827 	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4828 	    UIO_USERSPACE, upath, ctx);
4829 	error = namei(&nd);
4830 	if (error) {
4831 		return error;
4832 	}
4833 	dvp = nd.ni_dvp;
4834 	vp = nd.ni_vp;
4835 
4836 	/* check that this is a new file and authorize addition */
4837 	if (vp != NULL) {
4838 		error = EEXIST;
4839 		goto out;
4840 	}
4841 	VATTR_SET(vap, va_type, VFIFO);
4842 
4843 	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
4844 		goto out;
4845 	}
4846 
4847 	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4848 out:
4849 	/*
4850 	 * nameidone has to happen before we vnode_put(dvp)
4851 	 * since it may need to release the fs_nodelock on the dvp
4852 	 */
4853 	nameidone(&nd);
4854 
4855 	if (vp) {
4856 		vnode_put(vp);
4857 	}
4858 	vnode_put(dvp);
4859 
4860 	return error;
4861 }
4862 
4863 
4864 /*
4865  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4866  *
4867  * Parameters:	p			Process requesting the open
4868  *		uap			User argument descriptor (see below)
4869  *		retval			(Ignored)
4870  *
4871  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
4872  *		uap->uid		UID to set
4873  *		uap->gid		GID to set
4874  *		uap->mode		File mode to set (same as 'mkfifo')
4875  *		uap->xsecurity		ACL to set, if creating
4876  *
4877  * Returns:	0			Success
4878  *		!0			errno value
4879  *
4880  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4881  *
4882  * XXX:		We should enummerate the possible errno values here, and where
4883  *		in the code they originated.
4884  */
4885 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)4886 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4887 {
4888 	int ciferror;
4889 	kauth_filesec_t xsecdst;
4890 	struct vnode_attr va;
4891 
4892 	AUDIT_ARG(owner, uap->uid, uap->gid);
4893 
4894 	xsecdst = KAUTH_FILESEC_NONE;
4895 	if (uap->xsecurity != USER_ADDR_NULL) {
4896 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4897 			return ciferror;
4898 		}
4899 	}
4900 
4901 	VATTR_INIT(&va);
4902 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4903 	if (uap->uid != KAUTH_UID_NONE) {
4904 		VATTR_SET(&va, va_uid, uap->uid);
4905 	}
4906 	if (uap->gid != KAUTH_GID_NONE) {
4907 		VATTR_SET(&va, va_gid, uap->gid);
4908 	}
4909 	if (xsecdst != KAUTH_FILESEC_NONE) {
4910 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4911 		va.va_vaflags |= VA_FILESEC_ACL;
4912 	}
4913 
4914 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4915 
4916 	if (xsecdst != KAUTH_FILESEC_NONE) {
4917 		kauth_filesec_free(xsecdst);
4918 	}
4919 	return ciferror;
4920 }
4921 
4922 /* ARGSUSED */
4923 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)4924 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4925 {
4926 	struct vnode_attr va;
4927 
4928 	VATTR_INIT(&va);
4929 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4930 
4931 	return mkfifo1(vfs_context_current(), uap->path, &va);
4932 }
4933 
4934 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4935 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4936 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4937 
/*
 * Build the path of dvp (optionally with leafname appended) into 'path',
 * a buffer of _len bytes.  Never fails outright: on truncation or lookup
 * failure, *truncated_path is set to 1 and a best-effort path (an
 * ancestor, the mount point, or "/") is produced instead.  Returns the
 * resulting length including the terminating NUL.  'firmlink' selects
 * whether the firmlink-resolving vn_getpath() variant is used.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* overwrite the NUL with '/', then append the leaf */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* path fit only barely; report it as truncated */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* walk up the ancestor chain until some path fits */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5005 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-resolving flavor of safe_getpath_new(). */
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5011 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Non-firmlink-resolving flavor of safe_getpath_new(). */
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5017 
5018 /*
5019  * Make a hard file link.
5020  *
5021  * Returns:	0			Success
5022  *		EPERM
5023  *		EEXIST
5024  *		EXDEV
5025  *	namei:???
5026  *	vnode_authorize:???
5027  *	VNOP_LINK:???
5028  */
5029 /* ARGSUSED */
5030 static int
linkat_internal(vfs_context_t ctx,int fd1,user_addr_t path,int fd2,user_addr_t link,int flag,enum uio_seg segflg)5031 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
5032     user_addr_t link, int flag, enum uio_seg segflg)
5033 {
5034 	vnode_t vp, pvp, dvp, lvp;
5035 	struct nameidata nd;
5036 	int follow;
5037 	int error;
5038 #if CONFIG_FSE
5039 	fse_info finfo;
5040 #endif
5041 	int need_event, has_listeners, need_kpath2;
5042 	char *target_path = NULL;
5043 	char  *no_firmlink_path = NULL;
5044 	int truncated = 0;
5045 	int truncated_no_firmlink_path = 0;
5046 
5047 	vp = dvp = lvp = NULLVP;
5048 
5049 	/* look up the object we are linking to */
5050 	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
5051 	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
5052 	    segflg, path, ctx);
5053 
5054 	error = nameiat(&nd, fd1);
5055 	if (error) {
5056 		return error;
5057 	}
5058 	vp = nd.ni_vp;
5059 
5060 	nameidone(&nd);
5061 
5062 	/*
5063 	 * Normally, linking to directories is not supported.
5064 	 * However, some file systems may have limited support.
5065 	 */
5066 	if (vp->v_type == VDIR) {
5067 		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
5068 			error = EPERM;   /* POSIX */
5069 			goto out;
5070 		}
5071 
5072 		/* Linking to a directory requires ownership. */
5073 		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
5074 			struct vnode_attr dva;
5075 
5076 			VATTR_INIT(&dva);
5077 			VATTR_WANTED(&dva, va_uid);
5078 			if (vnode_getattr(vp, &dva, ctx) != 0 ||
5079 			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
5080 			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
5081 				error = EACCES;
5082 				goto out;
5083 			}
5084 		}
5085 	}
5086 
5087 	/* lookup the target node */
5088 #if CONFIG_TRIGGERS
5089 	nd.ni_op = OP_LINK;
5090 #endif
5091 	nd.ni_cnd.cn_nameiop = CREATE;
5092 	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
5093 	nd.ni_dirp = link;
5094 	error = nameiat(&nd, fd2);
5095 	if (error != 0) {
5096 		goto out;
5097 	}
5098 	dvp = nd.ni_dvp;
5099 	lvp = nd.ni_vp;
5100 
5101 #if CONFIG_MACF
5102 	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
5103 		goto out2;
5104 	}
5105 #endif
5106 
5107 	/* or to anything that kauth doesn't want us to (eg. immutable items) */
5108 	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
5109 		goto out2;
5110 	}
5111 
5112 	/* target node must not exist */
5113 	if (lvp != NULLVP) {
5114 		error = EEXIST;
5115 		goto out2;
5116 	}
5117 	/* cannot link across mountpoints */
5118 	if (vnode_mount(vp) != vnode_mount(dvp)) {
5119 		error = EXDEV;
5120 		goto out2;
5121 	}
5122 
5123 	/* authorize creation of the target note */
5124 	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
5125 		goto out2;
5126 	}
5127 
5128 	/* and finally make the link */
5129 	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
5130 	if (error) {
5131 		goto out2;
5132 	}
5133 
5134 #if CONFIG_MACF
5135 	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
5136 #endif
5137 
5138 #if CONFIG_FSE
5139 	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
5140 #else
5141 	need_event = 0;
5142 #endif
5143 	has_listeners = kauth_authorize_fileop_has_listeners();
5144 
5145 	need_kpath2 = 0;
5146 #if CONFIG_AUDIT
5147 	if (AUDIT_RECORD_EXISTS()) {
5148 		need_kpath2 = 1;
5149 	}
5150 #endif
5151 
5152 	if (need_event || has_listeners || need_kpath2) {
5153 		char *link_to_path = NULL;
5154 		int len, link_name_len;
5155 		int  len_no_firmlink_path = 0;
5156 
5157 		/* build the path to the new link file */
5158 		GET_PATH(target_path);
5159 
5160 		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
5161 		if (no_firmlink_path == NULL) {
5162 			GET_PATH(no_firmlink_path);
5163 		}
5164 		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5165 
5166 		AUDIT_ARG(kpath, target_path, ARG_KPATH2);
5167 
5168 		if (has_listeners) {
5169 			/* build the path to file we are linking to */
5170 			GET_PATH(link_to_path);
5171 
5172 			link_name_len = MAXPATHLEN;
5173 			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
5174 				/*
5175 				 * Call out to allow 3rd party notification of rename.
5176 				 * Ignore result of kauth_authorize_fileop call.
5177 				 */
5178 				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
5179 				    (uintptr_t)link_to_path,
5180 				    (uintptr_t)target_path);
5181 			}
5182 			if (link_to_path != NULL) {
5183 				RELEASE_PATH(link_to_path);
5184 			}
5185 		}
5186 #if CONFIG_FSE
5187 		if (need_event) {
5188 			/* construct fsevent */
5189 			if (get_fse_info(vp, &finfo, ctx) == 0) {
5190 				if (truncated_no_firmlink_path) {
5191 					finfo.mode |= FSE_TRUNCATED_PATH;
5192 				}
5193 
5194 				// build the path to the destination of the link
5195 				add_fsevent(FSE_CREATE_FILE, ctx,
5196 				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5197 				    FSE_ARG_FINFO, &finfo,
5198 				    FSE_ARG_DONE);
5199 			}
5200 
5201 			pvp = vp->v_parent;
5202 			// need an iocount on pvp in this case
5203 			if (pvp && pvp != dvp) {
5204 				error = vnode_get(pvp);
5205 				if (error) {
5206 					pvp = NULLVP;
5207 					error = 0;
5208 				}
5209 			}
5210 			if (pvp) {
5211 				add_fsevent(FSE_STAT_CHANGED, ctx,
5212 				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
5213 			}
5214 			if (pvp && pvp != dvp) {
5215 				vnode_put(pvp);
5216 			}
5217 		}
5218 #endif
5219 	}
5220 out2:
5221 	/*
5222 	 * nameidone has to happen before we vnode_put(dvp)
5223 	 * since it may need to release the fs_nodelock on the dvp
5224 	 */
5225 	nameidone(&nd);
5226 	if (target_path != NULL) {
5227 		RELEASE_PATH(target_path);
5228 	}
5229 	if (no_firmlink_path != NULL) {
5230 		RELEASE_PATH(no_firmlink_path);
5231 		no_firmlink_path = NULL;
5232 	}
5233 out:
5234 	if (lvp) {
5235 		vnode_put(lvp);
5236 	}
5237 	if (dvp) {
5238 		vnode_put(dvp);
5239 	}
5240 	vnode_put(vp);
5241 	return error;
5242 }
5243 
5244 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5245 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5246 {
5247 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5248 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5249 }
5250 
5251 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5252 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5253 {
5254 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5255 		return EINVAL;
5256 	}
5257 
5258 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5259 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5260 }
5261 
5262 /*
5263  * Make a symbolic link.
5264  *
5265  * We could add support for ACLs here too...
5266  */
5267 /* ARGSUSED */
5268 static int
symlinkat_internal(vfs_context_t ctx,user_addr_t path_data,int fd,user_addr_t link,enum uio_seg segflg)5269 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
5270     user_addr_t link, enum uio_seg segflg)
5271 {
5272 	struct vnode_attr va;
5273 	char *path;
5274 	int error;
5275 	struct nameidata nd;
5276 	vnode_t vp, dvp;
5277 	size_t dummy = 0;
5278 	proc_t p;
5279 
5280 	error = 0;
5281 	if (UIO_SEG_IS_USER_SPACE(segflg)) {
5282 		path = zalloc(ZV_NAMEI);
5283 		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
5284 	} else {
5285 		path = (char *)path_data;
5286 	}
5287 	if (error) {
5288 		goto out;
5289 	}
5290 	AUDIT_ARG(text, path);  /* This is the link string */
5291 
5292 	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
5293 	    segflg, link, ctx);
5294 
5295 	error = nameiat(&nd, fd);
5296 	if (error) {
5297 		goto out;
5298 	}
5299 	dvp = nd.ni_dvp;
5300 	vp = nd.ni_vp;
5301 
5302 	p = vfs_context_proc(ctx);
5303 	VATTR_INIT(&va);
5304 	VATTR_SET(&va, va_type, VLNK);
5305 	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);
5306 
5307 #if CONFIG_MACF
5308 	error = mac_vnode_check_create(ctx,
5309 	    dvp, &nd.ni_cnd, &va);
5310 #endif
5311 	if (error != 0) {
5312 		goto skipit;
5313 	}
5314 
5315 	if (vp != NULL) {
5316 		error = EEXIST;
5317 		goto skipit;
5318 	}
5319 
5320 	/* authorize */
5321 	if (error == 0) {
5322 		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
5323 	}
5324 	/* get default ownership, etc. */
5325 	if (error == 0) {
5326 		error = vnode_authattr_new(dvp, &va, 0, ctx);
5327 	}
5328 	if (error == 0) {
5329 		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
5330 	}
5331 
5332 	/* do fallback attribute handling */
5333 	if (error == 0 && vp) {
5334 		error = vnode_setattr_fallback(vp, &va, ctx);
5335 	}
5336 
5337 #if CONFIG_MACF
5338 	if (error == 0 && vp) {
5339 		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
5340 	}
5341 #endif
5342 
5343 	if (error == 0) {
5344 		int     update_flags = 0;
5345 
5346 		/*check if a new vnode was created, else try to get one*/
5347 		if (vp == NULL) {
5348 			nd.ni_cnd.cn_nameiop = LOOKUP;
5349 #if CONFIG_TRIGGERS
5350 			nd.ni_op = OP_LOOKUP;
5351 #endif
5352 			/*
5353 			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
5354 			 * reallocated again in namei().
5355 			 */
5356 			nd.ni_cnd.cn_flags &= HASBUF;
5357 			error = nameiat(&nd, fd);
5358 			if (error) {
5359 				goto skipit;
5360 			}
5361 			vp = nd.ni_vp;
5362 		}
5363 
5364 #if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
5365 		/* call out to allow 3rd party notification of rename.
5366 		 * Ignore result of kauth_authorize_fileop call.
5367 		 */
5368 		if (kauth_authorize_fileop_has_listeners() &&
5369 		    namei(&nd) == 0) {
5370 			char *new_link_path = NULL;
5371 			int             len;
5372 
5373 			/* build the path to the new link file */
5374 			new_link_path = get_pathbuff();
5375 			len = MAXPATHLEN;
5376 			vn_getpath(dvp, new_link_path, &len);
5377 			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
5378 				new_link_path[len - 1] = '/';
5379 				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
5380 			}
5381 
5382 			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
5383 			    (uintptr_t)path, (uintptr_t)new_link_path);
5384 			if (new_link_path != NULL) {
5385 				release_pathbuff(new_link_path);
5386 			}
5387 		}
5388 #endif
5389 		// Make sure the name & parent pointers are hooked up
5390 		if (vp->v_name == NULL) {
5391 			update_flags |= VNODE_UPDATE_NAME;
5392 		}
5393 		if (vp->v_parent == NULLVP) {
5394 			update_flags |= VNODE_UPDATE_PARENT;
5395 		}
5396 
5397 		if (update_flags) {
5398 			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
5399 		}
5400 
5401 #if CONFIG_FSE
5402 		add_fsevent(FSE_CREATE_FILE, ctx,
5403 		    FSE_ARG_VNODE, vp,
5404 		    FSE_ARG_DONE);
5405 #endif
5406 	}
5407 
5408 skipit:
5409 	/*
5410 	 * nameidone has to happen before we vnode_put(dvp)
5411 	 * since it may need to release the fs_nodelock on the dvp
5412 	 */
5413 	nameidone(&nd);
5414 
5415 	if (vp) {
5416 		vnode_put(vp);
5417 	}
5418 	vnode_put(dvp);
5419 out:
5420 	if (path && (path != (char *)path_data)) {
5421 		zfree(ZV_NAMEI, path);
5422 	}
5423 
5424 	return error;
5425 }
5426 
5427 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5428 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5429 {
5430 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5431 	           uap->link, UIO_USERSPACE);
5432 }
5433 
5434 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5435 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5436     __unused int32_t *retval)
5437 {
5438 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5439 	           uap->path2, UIO_USERSPACE);
5440 }
5441 
5442 /*
5443  * Delete a whiteout from the filesystem.
5444  * No longer supported.
5445  */
5446 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5447 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5448 {
5449 	return ENOTSUP;
5450 }
5451 
5452 /*
5453  * Delete a name from the filesystem.
5454  */
5455 /* ARGSUSED */
5456 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5457 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5458     user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5459 {
5460 	struct {
5461 		struct nameidata nd;
5462 #if CONFIG_FSE
5463 		struct vnode_attr va;
5464 		fse_info finfo;
5465 #endif
5466 	} *__unlink_data;
5467 	struct nameidata *ndp;
5468 	vnode_t vp, dvp;
5469 	int error;
5470 	struct componentname *cnp;
5471 	char  *path = NULL;
5472 	char  *no_firmlink_path = NULL;
5473 	int  len_path = 0;
5474 	int  len_no_firmlink_path = 0;
5475 	int flags;
5476 	int need_event;
5477 	int has_listeners;
5478 	int truncated_path;
5479 	int truncated_no_firmlink_path;
5480 	int batched;
5481 	struct vnode_attr *vap;
5482 	int do_retry;
5483 	int retry_count = 0;
5484 	int cn_flags;
5485 
5486 	cn_flags = LOCKPARENT;
5487 	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
5488 		cn_flags |= AUDITVNPATH1;
5489 	}
5490 	/* If a starting dvp is passed, it trumps any fd passed. */
5491 	if (start_dvp) {
5492 		cn_flags |= USEDVP;
5493 	}
5494 
5495 #if NAMEDRSRCFORK
5496 	/* unlink or delete is allowed on rsrc forks and named streams */
5497 	cn_flags |= CN_ALLOWRSRCFORK;
5498 #endif
5499 
5500 	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
5501 	ndp = &__unlink_data->nd;
5502 #if CONFIG_FSE
5503 	fse_info *finfop = &__unlink_data->finfo;
5504 #endif
5505 
5506 retry:
5507 	do_retry = 0;
5508 	flags = 0;
5509 	need_event = 0;
5510 	has_listeners = 0;
5511 	truncated_path = 0;
5512 	truncated_no_firmlink_path = 0;
5513 	vap = NULL;
5514 
5515 	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
5516 
5517 	ndp->ni_dvp = start_dvp;
5518 	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
5519 	cnp = &ndp->ni_cnd;
5520 
5521 continue_lookup:
5522 	error = nameiat(ndp, fd);
5523 	if (error) {
5524 		goto early_out;
5525 	}
5526 
5527 	dvp = ndp->ni_dvp;
5528 	vp = ndp->ni_vp;
5529 
5530 	/* With Carbon delete semantics, busy files cannot be deleted */
5531 	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
5532 		flags |= VNODE_REMOVE_NODELETEBUSY;
5533 	}
5534 
5535 	/* Skip any potential upcalls if told to. */
5536 	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
5537 		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
5538 	}
5539 
5540 	if (vp) {
5541 		batched = vnode_compound_remove_available(vp);
5542 		/*
5543 		 * The root of a mounted filesystem cannot be deleted.
5544 		 */
5545 		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
5546 			error = EBUSY;
5547 			goto out;
5548 		}
5549 
5550 #if DEVELOPMENT || DEBUG
5551 		/*
5552 		 * XXX VSWAP: Check for entitlements or special flag here
5553 		 * so we can restrict access appropriately.
5554 		 */
5555 #else /* DEVELOPMENT || DEBUG */
5556 
5557 		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
5558 			error = EPERM;
5559 			goto out;
5560 		}
5561 #endif /* DEVELOPMENT || DEBUG */
5562 
5563 		if (!batched) {
5564 			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
5565 			if (error) {
5566 				if (error == ENOENT) {
5567 					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5568 						do_retry = 1;
5569 						retry_count++;
5570 					}
5571 				}
5572 				goto out;
5573 			}
5574 		}
5575 	} else {
5576 		batched = 1;
5577 
5578 		if (!vnode_compound_remove_available(dvp)) {
5579 			panic("No vp, but no compound remove?");
5580 		}
5581 	}
5582 
5583 #if CONFIG_FSE
5584 	need_event = need_fsevent(FSE_DELETE, dvp);
5585 	if (need_event) {
5586 		if (!batched) {
5587 			if ((vp->v_flag & VISHARDLINK) == 0) {
5588 				/* XXX need to get these data in batched VNOP */
5589 				get_fse_info(vp, finfop, ctx);
5590 			}
5591 		} else {
5592 			error =
5593 			    vfs_get_notify_attributes(&__unlink_data->va);
5594 			if (error) {
5595 				goto out;
5596 			}
5597 
5598 			vap = &__unlink_data->va;
5599 		}
5600 	}
5601 #endif
5602 	has_listeners = kauth_authorize_fileop_has_listeners();
5603 	if (need_event || has_listeners) {
5604 		if (path == NULL) {
5605 			GET_PATH(path);
5606 		}
5607 		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
5608 		if (no_firmlink_path == NULL) {
5609 			GET_PATH(no_firmlink_path);
5610 		}
5611 		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5612 	}
5613 
5614 #if NAMEDRSRCFORK
5615 	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
5616 		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
5617 	} else
5618 #endif
5619 	{
5620 		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
5621 		vp = ndp->ni_vp;
5622 		if (error == EKEEPLOOKING) {
5623 			if (!batched) {
5624 				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
5625 			}
5626 
5627 			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
5628 				panic("EKEEPLOOKING, but continue flag not set?");
5629 			}
5630 
5631 			if (vnode_isdir(vp)) {
5632 				error = EISDIR;
5633 				goto out;
5634 			}
5635 			goto continue_lookup;
5636 		} else if (error == ENOENT && batched) {
5637 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5638 				/*
5639 				 * For compound VNOPs, the authorization callback may
5640 				 * return ENOENT in case of racing hardlink lookups
5641 				 * hitting the name  cache, redrive the lookup.
5642 				 */
5643 				do_retry = 1;
5644 				retry_count += 1;
5645 				goto out;
5646 			}
5647 		}
5648 	}
5649 
5650 	/*
5651 	 * Call out to allow 3rd party notification of delete.
5652 	 * Ignore result of kauth_authorize_fileop call.
5653 	 */
5654 	if (!error) {
5655 		if (has_listeners) {
5656 			kauth_authorize_fileop(vfs_context_ucred(ctx),
5657 			    KAUTH_FILEOP_DELETE,
5658 			    (uintptr_t)vp,
5659 			    (uintptr_t)path);
5660 		}
5661 
5662 		if (vp->v_flag & VISHARDLINK) {
5663 			//
5664 			// if a hardlink gets deleted we want to blow away the
5665 			// v_parent link because the path that got us to this
5666 			// instance of the link is no longer valid.  this will
5667 			// force the next call to get the path to ask the file
5668 			// system instead of just following the v_parent link.
5669 			//
5670 			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
5671 		}
5672 
5673 #if CONFIG_FSE
5674 		if (need_event) {
5675 			if (vp->v_flag & VISHARDLINK) {
5676 				get_fse_info(vp, finfop, ctx);
5677 			} else if (vap) {
5678 				vnode_get_fse_info_from_vap(vp, finfop, vap);
5679 			}
5680 			if (truncated_path) {
5681 				finfop->mode |= FSE_TRUNCATED_PATH;
5682 			}
5683 			add_fsevent(FSE_DELETE, ctx,
5684 			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5685 			    FSE_ARG_FINFO, finfop,
5686 			    FSE_ARG_DONE);
5687 		}
5688 #endif
5689 	}
5690 
5691 out:
5692 	if (path != NULL) {
5693 		RELEASE_PATH(path);
5694 		path = NULL;
5695 	}
5696 
5697 	if (no_firmlink_path != NULL) {
5698 		RELEASE_PATH(no_firmlink_path);
5699 		no_firmlink_path = NULL;
5700 	}
5701 #if NAMEDRSRCFORK
5702 	/* recycle the deleted rsrc fork vnode to force a reclaim, which
5703 	 * will cause its shadow file to go away if necessary.
5704 	 */
5705 	if (vp && (vnode_isnamedstream(vp)) &&
5706 	    (vp->v_parent != NULLVP) &&
5707 	    vnode_isshadow(vp)) {
5708 		vnode_recycle(vp);
5709 	}
5710 #endif
5711 	/*
5712 	 * nameidone has to happen before we vnode_put(dvp)
5713 	 * since it may need to release the fs_nodelock on the dvp
5714 	 */
5715 	nameidone(ndp);
5716 	vnode_put(dvp);
5717 	if (vp) {
5718 		vnode_put(vp);
5719 	}
5720 
5721 	if (do_retry) {
5722 		goto retry;
5723 	}
5724 
5725 early_out:
5726 	kfree_type(typeof(*__unlink_data), __unlink_data);
5727 	return error;
5728 }
5729 
5730 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5731 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5732     enum uio_seg segflg, int unlink_flags)
5733 {
5734 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5735 	           unlink_flags);
5736 }
5737 
5738 /*
5739  * Delete a name from the filesystem using Carbon semantics.
5740  */
5741 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)5742 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5743 {
5744 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5745 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5746 }
5747 
5748 /*
5749  * Delete a name from the filesystem using POSIX semantics.
5750  */
5751 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)5752 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5753 {
5754 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5755 	           uap->path, UIO_USERSPACE, 0);
5756 }
5757 
5758 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)5759 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5760 {
5761 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5762 		return EINVAL;
5763 	}
5764 
5765 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5766 		int unlink_flags = 0;
5767 
5768 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
5769 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5770 		}
5771 		return rmdirat_internal(vfs_context_current(), uap->fd,
5772 		           uap->path, UIO_USERSPACE, unlink_flags);
5773 	} else {
5774 		return unlinkat_internal(vfs_context_current(), uap->fd,
5775 		           NULLVP, uap->path, UIO_USERSPACE, 0);
5776 	}
5777 }
5778 
5779 /*
5780  * Reposition read/write file offset.
5781  */
5782 int
lseek(proc_t p,struct lseek_args * uap,off_t * retval)5783 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5784 {
5785 	struct fileproc *fp;
5786 	vnode_t vp;
5787 	struct vfs_context *ctx;
5788 	off_t offset = uap->offset, file_size;
5789 	int error;
5790 
5791 	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
5792 		if (error == ENOTSUP) {
5793 			return ESPIPE;
5794 		}
5795 		return error;
5796 	}
5797 	if (vnode_isfifo(vp)) {
5798 		file_drop(uap->fd);
5799 		return ESPIPE;
5800 	}
5801 
5802 
5803 	ctx = vfs_context_current();
5804 #if CONFIG_MACF
5805 	if (uap->whence == L_INCR && uap->offset == 0) {
5806 		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5807 		    fp->fp_glob);
5808 	} else {
5809 		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5810 		    fp->fp_glob);
5811 	}
5812 	if (error) {
5813 		file_drop(uap->fd);
5814 		return error;
5815 	}
5816 #endif
5817 	if ((error = vnode_getwithref(vp))) {
5818 		file_drop(uap->fd);
5819 		return error;
5820 	}
5821 
5822 	switch (uap->whence) {
5823 	case L_INCR:
5824 		offset += fp->fp_glob->fg_offset;
5825 		break;
5826 	case L_XTND:
5827 		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
5828 			break;
5829 		}
5830 		offset += file_size;
5831 		break;
5832 	case L_SET:
5833 		break;
5834 	case SEEK_HOLE:
5835 		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5836 		break;
5837 	case SEEK_DATA:
5838 		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5839 		break;
5840 	default:
5841 		error = EINVAL;
5842 	}
5843 	if (error == 0) {
5844 		if (uap->offset > 0 && offset < 0) {
5845 			/* Incremented/relative move past max size */
5846 			error = EOVERFLOW;
5847 		} else {
5848 			/*
5849 			 * Allow negative offsets on character devices, per
5850 			 * POSIX 1003.1-2001.  Most likely for writing disk
5851 			 * labels.
5852 			 */
5853 			if (offset < 0 && vp->v_type != VCHR) {
5854 				/* Decremented/relative move before start */
5855 				error = EINVAL;
5856 			} else {
5857 				/* Success */
5858 				fp->fp_glob->fg_offset = offset;
5859 				*retval = fp->fp_glob->fg_offset;
5860 			}
5861 		}
5862 	}
5863 
5864 	/*
5865 	 * An lseek can affect whether data is "available to read."  Use
5866 	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
5867 	 */
5868 	post_event_if_success(vp, error, NOTE_NONE);
5869 	(void)vnode_put(vp);
5870 	file_drop(uap->fd);
5871 	return error;
5872 }
5873 
5874 
5875 /*
5876  * Check access permissions.
5877  *
5878  * Returns:	0			Success
5879  *		vnode_authorize:???
5880  */
5881 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)5882 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5883 {
5884 	kauth_action_t action;
5885 	int error;
5886 
5887 	/*
5888 	 * If just the regular access bits, convert them to something
5889 	 * that vnode_authorize will understand.
5890 	 */
5891 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5892 		action = 0;
5893 		if (uflags & R_OK) {
5894 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
5895 		}
5896 		if (uflags & W_OK) {
5897 			if (vnode_isdir(vp)) {
5898 				action |= KAUTH_VNODE_ADD_FILE |
5899 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
5900 				/* might want delete rights here too */
5901 			} else {
5902 				action |= KAUTH_VNODE_WRITE_DATA;
5903 			}
5904 		}
5905 		if (uflags & X_OK) {
5906 			if (vnode_isdir(vp)) {
5907 				action |= KAUTH_VNODE_SEARCH;
5908 			} else {
5909 				action |= KAUTH_VNODE_EXECUTE;
5910 			}
5911 		}
5912 	} else {
5913 		/* take advantage of definition of uflags */
5914 		action = uflags >> 8;
5915 	}
5916 
5917 #if CONFIG_MACF
5918 	error = mac_vnode_check_access(ctx, vp, uflags);
5919 	if (error) {
5920 		return error;
5921 	}
5922 #endif /* MAC */
5923 
5924 	/* action == 0 means only check for existence */
5925 	if (action != 0) {
5926 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5927 	} else {
5928 		error = 0;
5929 	}
5930 
5931 	return error;
5932 }
5933 
5934 
5935 
5936 /*
5937  * access_extended: Check access permissions in bulk.
5938  *
5939  * Description:	uap->entries		Pointer to an array of accessx
5940  *                                      descriptor structs, plus one or
5941  *                                      more NULL terminated strings (see
5942  *                                      "Notes" section below).
5943  *		uap->size		Size of the area pointed to by
5944  *					uap->entries.
5945  *		uap->results		Pointer to the results array.
5946  *
5947  * Returns:	0			Success
5948  *		ENOMEM			Insufficient memory
5949  *		EINVAL			Invalid arguments
5950  *		namei:EFAULT		Bad address
5951  *		namei:ENAMETOOLONG	Filename too long
5952  *		namei:ENOENT		No such file or directory
5953  *		namei:ELOOP		Too many levels of symbolic links
5954  *		namei:EBADF		Bad file descriptor
5955  *		namei:ENOTDIR		Not a directory
5956  *		namei:???
5957  *		access1:
5958  *
5959  * Implicit returns:
5960  *		uap->results		Array contents modified
5961  *
5962  * Notes:	The uap->entries are structured as an arbitrary length array
5963  *		of accessx descriptors, followed by one or more NULL terminated
5964  *		strings
5965  *
5966  *			struct accessx_descriptor[0]
5967  *			...
5968  *			struct accessx_descriptor[n]
5969  *			char name_data[0];
5970  *
5971  *		We determine the entry count by walking the buffer containing
5972  *		the uap->entries argument descriptor.  For each descriptor we
5973  *		see, the valid values for the offset ad_name_offset will be
5974  *		in the byte range:
5975  *
5976  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
5977  *						to
5978  *				[ uap->entries + uap->size - 2 ]
5979  *
5980  *		since we must have at least one string, and the string must
5981  *		be at least one character plus the NULL terminator in length.
5982  *
5983  * XXX:		Need to support the check-as uid argument
5984  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	/* Small requests are staged on the stack to avoid an allocation. */
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL lets the cleanup path know whether a cred was ever taken. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			/* Per-entry failures are reported in the result array. */
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6226 
6227 
6228 /*
6229  * Returns:	0			Success
6230  *		namei:EFAULT		Bad address
6231  *		namei:ENAMETOOLONG	Filename too long
6232  *		namei:ENOENT		No such file or directory
6233  *		namei:ELOOP		Too many levels of symbolic links
6234  *		namei:EBADF		Bad file descriptor
6235  *		namei:ENOTDIR		Not a directory
6236  *		namei:???
6237  *		access1:
6238  */
/*
 * Common implementation of access() and faccessat(): check whether the
 * file at 'path' (resolved relative to 'fd') may be accessed with the
 * rights in 'amode', using the real credential unless AT_EACCESS is set.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* The parent was only taken (WANTPARENT) for the delete test. */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* A real credential was only taken in the non-AT_EACCESS case. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6317 
6318 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6319 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6320 {
6321 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6322 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6323 }
6324 
6325 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6326 faccessat(__unused proc_t p, struct faccessat_args *uap,
6327     __unused int32_t *retval)
6328 {
6329 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6330 		return EINVAL;
6331 	}
6332 
6333 	return faccessat_internal(vfs_context_current(), uap->fd,
6334 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6335 }
6336 
6337 /*
6338  * Returns:	0			Success
6339  *		EFAULT
6340  *	copyout:EFAULT
6341  *	namei:???
6342  *	vn_stat:???
6343  */
6344 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6345 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6346     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6347     enum uio_seg segflg, int fd, int flag)
6348 {
6349 	struct nameidata nd;
6350 	int follow;
6351 	union {
6352 		struct stat sb;
6353 		struct stat64 sb64;
6354 	} source = {};
6355 	union {
6356 		struct user64_stat user64_sb;
6357 		struct user32_stat user32_sb;
6358 		struct user64_stat64 user64_sb64;
6359 		struct user32_stat64 user32_sb64;
6360 	} dest = {};
6361 	caddr_t sbp;
6362 	int error, my_size;
6363 	kauth_filesec_t fsec;
6364 	size_t xsecurity_bufsize;
6365 	void * statptr;
6366 	struct fileproc *fp = NULL;
6367 	int needsrealdev = 0;
6368 
6369 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6370 	NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6371 	    segflg, path, ctx);
6372 
6373 #if NAMEDRSRCFORK
6374 	int is_namedstream = 0;
6375 	/* stat calls are allowed for resource forks. */
6376 	nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6377 #endif
6378 
6379 	if (flag & AT_FDONLY) {
6380 		vnode_t fvp;
6381 
6382 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6383 		if (error) {
6384 			return error;
6385 		}
6386 		if ((error = vnode_getwithref(fvp))) {
6387 			file_drop(fd);
6388 			return error;
6389 		}
6390 		nd.ni_vp = fvp;
6391 	} else {
6392 		error = nameiat(&nd, fd);
6393 		if (error) {
6394 			return error;
6395 		}
6396 	}
6397 	fsec = KAUTH_FILESEC_NONE;
6398 
6399 	statptr = (void *)&source;
6400 
6401 #if NAMEDRSRCFORK
6402 	/* Grab reference on the shadow stream file vnode to
6403 	 * force an inactive on release which will mark it
6404 	 * for recycle.
6405 	 */
6406 	if (vnode_isnamedstream(nd.ni_vp) &&
6407 	    (nd.ni_vp->v_parent != NULLVP) &&
6408 	    vnode_isshadow(nd.ni_vp)) {
6409 		is_namedstream = 1;
6410 		vnode_ref(nd.ni_vp);
6411 	}
6412 #endif
6413 
6414 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6415 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6416 		/*
6417 		 * If the caller has the file open, and is not
6418 		 * requesting extended security information, we are
6419 		 * going to let them get the basic stat information.
6420 		 */
6421 		error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6422 		    fp->fp_glob->fg_cred);
6423 	} else {
6424 		error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6425 		    isstat64, needsrealdev, ctx);
6426 	}
6427 
6428 #if NAMEDRSRCFORK
6429 	if (is_namedstream) {
6430 		vnode_rele(nd.ni_vp);
6431 	}
6432 #endif
6433 	vnode_put(nd.ni_vp);
6434 	nameidone(&nd);
6435 	if (fp) {
6436 		file_drop(fd);
6437 		fp = NULL;
6438 	}
6439 
6440 	if (error) {
6441 		return error;
6442 	}
6443 	/* Zap spare fields */
6444 	if (isstat64 != 0) {
6445 		source.sb64.st_lspare = 0;
6446 		source.sb64.st_qspare[0] = 0LL;
6447 		source.sb64.st_qspare[1] = 0LL;
6448 		if (vfs_context_is64bit(ctx)) {
6449 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6450 			my_size = sizeof(dest.user64_sb64);
6451 			sbp = (caddr_t)&dest.user64_sb64;
6452 		} else {
6453 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6454 			my_size = sizeof(dest.user32_sb64);
6455 			sbp = (caddr_t)&dest.user32_sb64;
6456 		}
6457 		/*
6458 		 * Check if we raced (post lookup) against the last unlink of a file.
6459 		 */
6460 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6461 			source.sb64.st_nlink = 1;
6462 		}
6463 	} else {
6464 		source.sb.st_lspare = 0;
6465 		source.sb.st_qspare[0] = 0LL;
6466 		source.sb.st_qspare[1] = 0LL;
6467 		if (vfs_context_is64bit(ctx)) {
6468 			munge_user64_stat(&source.sb, &dest.user64_sb);
6469 			my_size = sizeof(dest.user64_sb);
6470 			sbp = (caddr_t)&dest.user64_sb;
6471 		} else {
6472 			munge_user32_stat(&source.sb, &dest.user32_sb);
6473 			my_size = sizeof(dest.user32_sb);
6474 			sbp = (caddr_t)&dest.user32_sb;
6475 		}
6476 
6477 		/*
6478 		 * Check if we raced (post lookup) against the last unlink of a file.
6479 		 */
6480 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6481 			source.sb.st_nlink = 1;
6482 		}
6483 	}
6484 	if ((error = copyout(sbp, ub, my_size)) != 0) {
6485 		goto out;
6486 	}
6487 
6488 	/* caller wants extended security information? */
6489 	if (xsecurity != USER_ADDR_NULL) {
6490 		/* did we get any? */
6491 		if (fsec == KAUTH_FILESEC_NONE) {
6492 			if (susize(xsecurity_size, 0) != 0) {
6493 				error = EFAULT;
6494 				goto out;
6495 			}
6496 		} else {
6497 			/* find the user buffer size */
6498 			xsecurity_bufsize = fusize(xsecurity_size);
6499 
6500 			/* copy out the actual data size */
6501 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6502 				error = EFAULT;
6503 				goto out;
6504 			}
6505 
6506 			/* if the caller supplied enough room, copy out to it */
6507 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6508 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6509 			}
6510 		}
6511 	}
6512 out:
6513 	if (fsec != KAUTH_FILESEC_NONE) {
6514 		kauth_filesec_free(fsec);
6515 	}
6516 	return error;
6517 }
6518 
6519 /*
6520  * stat_extended: Get file status; with extended security (ACL).
6521  *
6522  * Parameters:    p                       (ignored)
6523  *                uap                     User argument descriptor (see below)
6524  *                retval                  (ignored)
6525  *
6526  * Indirect:      uap->path               Path of file to get status from
6527  *                uap->ub                 User buffer (holds file status info)
6528  *                uap->xsecurity          ACL to get (extended security)
6529  *                uap->xsecurity_size     Size of ACL
6530  *
6531  * Returns:        0                      Success
6532  *                !0                      errno value
6533  *
6534  */
6535 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)6536 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6537     __unused int32_t *retval)
6538 {
6539 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6540 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6541 	           0);
6542 }
6543 
6544 /*
6545  * Returns:	0			Success
6546  *	fstatat_internal:???		[see fstatat_internal() in this file]
6547  */
6548 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)6549 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6550 {
6551 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6552 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6553 }
6554 
6555 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)6556 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6557 {
6558 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6559 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6560 }
6561 
6562 /*
6563  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6564  *
6565  * Parameters:    p                       (ignored)
6566  *                uap                     User argument descriptor (see below)
6567  *                retval                  (ignored)
6568  *
6569  * Indirect:      uap->path               Path of file to get status from
6570  *                uap->ub                 User buffer (holds file status info)
6571  *                uap->xsecurity          ACL to get (extended security)
6572  *                uap->xsecurity_size     Size of ACL
6573  *
6574  * Returns:        0                      Success
6575  *                !0                      errno value
6576  *
6577  */
6578 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)6579 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6580 {
6581 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6582 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6583 	           0);
6584 }
6585 
6586 /*
6587  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6588  *
6589  * Parameters:    p                       (ignored)
6590  *                uap                     User argument descriptor (see below)
6591  *                retval                  (ignored)
6592  *
6593  * Indirect:      uap->path               Path of file to get status from
6594  *                uap->ub                 User buffer (holds file status info)
6595  *                uap->xsecurity          ACL to get (extended security)
6596  *                uap->xsecurity_size     Size of ACL
6597  *
6598  * Returns:        0                      Success
6599  *                !0                      errno value
6600  *
6601  */
6602 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)6603 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6604 {
6605 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6606 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6607 	           AT_SYMLINK_NOFOLLOW);
6608 }
6609 
6610 /*
6611  * Get file status; this version does not follow links.
6612  */
6613 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)6614 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6615 {
6616 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6617 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6618 }
6619 
6620 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)6621 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6622 {
6623 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6624 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6625 }
6626 
6627 /*
6628  * lstat64_extended: Get file status; can handle large inode numbers; does not
6629  * follow links; with extended security (ACL).
6630  *
6631  * Parameters:    p                       (ignored)
6632  *                uap                     User argument descriptor (see below)
6633  *                retval                  (ignored)
6634  *
6635  * Indirect:      uap->path               Path of file to get status from
6636  *                uap->ub                 User buffer (holds file status info)
6637  *                uap->xsecurity          ACL to get (extended security)
6638  *                uap->xsecurity_size     Size of ACL
6639  *
6640  * Returns:        0                      Success
6641  *                !0                      errno value
6642  *
6643  */
6644 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)6645 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6646 {
6647 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6648 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6649 	           AT_SYMLINK_NOFOLLOW);
6650 }
6651 
6652 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)6653 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6654 {
6655 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6656 		return EINVAL;
6657 	}
6658 
6659 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6660 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6661 }
6662 
6663 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)6664 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6665     __unused int32_t *retval)
6666 {
6667 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6668 		return EINVAL;
6669 	}
6670 
6671 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6672 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6673 }
6674 
6675 /*
6676  * Get configurable pathname variables.
6677  *
6678  * Returns:	0			Success
6679  *	namei:???
6680  *	vn_pathconf:???
6681  *
6682  * Notes:	Global implementation  constants are intended to be
6683  *		implemented in this function directly; all other constants
6684  *		are per-FS implementation, and therefore must be handled in
6685  *		each respective FS, instead.
6686  *
6687  * XXX We implement some things globally right now that should actually be
6688  * XXX per-FS; we will need to deal with this at some point.
6689  */
6690 /* ARGSUSED */
6691 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)6692 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6693 {
6694 	int error;
6695 	struct nameidata nd;
6696 	vfs_context_t ctx = vfs_context_current();
6697 
6698 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6699 	    UIO_USERSPACE, uap->path, ctx);
6700 	error = namei(&nd);
6701 	if (error) {
6702 		return error;
6703 	}
6704 
6705 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6706 
6707 	vnode_put(nd.ni_vp);
6708 	nameidone(&nd);
6709 	return error;
6710 }
6711 
6712 /*
6713  * Return target name of a symbolic link.
6714  */
6715 /* ARGSUSED */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* *retval is an int; reject buffer sizes it cannot represent */
	if (bufsize > INT32_MAX) {
		return EINVAL;
	}

	/* NOFOLLOW: the symlink itself is the object of the lookup */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* single-iovec uio built on the stack buffer above */
	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
		/* MAC check first, then read-data authorization, then read */
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		if (error == 0) {
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		}
		if (error == 0) {
			error = VNOP_READLINK(vp, auio, ctx);
		}
	}
	vnode_put(vp);

	/* bytes placed in the buffer (0 if nothing was transferred) */
	*retval = (int)(bufsize - uio_resid(auio));
	return error;
}
6764 
6765 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)6766 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6767 {
6768 	enum uio_seg procseg;
6769 
6770 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6771 	return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6772 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6773 	           uap->count, procseg, retval);
6774 }
6775 
6776 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)6777 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6778 {
6779 	enum uio_seg procseg;
6780 
6781 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6782 	return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6783 	           procseg, uap->buf, uap->bufsize, procseg, retval);
6784 }
6785 
6786 /*
6787  * Change file flags, the deep inner layer.
6788  */
/*
 * vp      - vnode to operate on (caller holds an iocount)
 * va      - requested attributes (va_flags), used for authorization
 * setattr - callback that applies the change, invoked with 'arg'
 * arg     - opaque argument passed through to 'setattr'
 * ctx     - authorization context
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* apply the change via the caller-supplied callback */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* notify MAC modules only on a successful change */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
6827 
6828 /*
6829  * Change file flags.
6830  *
6831  * NOTE: this will vnode_put() `vp'
6832  */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* the (void *) cast adapts vnode_setattr()'s prototype to
	 * chflags0()'s generic (vnode_t, void *, vfs_context_t) callback */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	/* this function consumes the caller's iocount on vp */
	vnode_put(vp);

	/* setattr succeeded but the filesystem doesn't support flags */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
6851 
6852 /*
6853  * Change flags of a file given a path name.
6854  */
6855 /* ARGSUSED */
6856 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)6857 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6858 {
6859 	vnode_t vp;
6860 	vfs_context_t ctx = vfs_context_current();
6861 	int error;
6862 	struct nameidata nd;
6863 
6864 	AUDIT_ARG(fflags, uap->flags);
6865 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6866 	    UIO_USERSPACE, uap->path, ctx);
6867 	error = namei(&nd);
6868 	if (error) {
6869 		return error;
6870 	}
6871 	vp = nd.ni_vp;
6872 	nameidone(&nd);
6873 
6874 	/* we don't vnode_put() here because chflags1 does internally */
6875 	error = chflags1(vp, uap->flags, ctx);
6876 
6877 	return error;
6878 }
6879 
6880 /*
6881  * Change flags of a file given a file descriptor.
6882  */
6883 /* ARGSUSED */
6884 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)6885 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6886 {
6887 	vnode_t vp;
6888 	int error;
6889 
6890 	AUDIT_ARG(fd, uap->fd);
6891 	AUDIT_ARG(fflags, uap->flags);
6892 	if ((error = file_vnode(uap->fd, &vp))) {
6893 		return error;
6894 	}
6895 
6896 	if ((error = vnode_getwithref(vp))) {
6897 		file_drop(uap->fd);
6898 		return error;
6899 	}
6900 
6901 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6902 
6903 	/* we don't vnode_put() here because chflags1 does internally */
6904 	error = chflags1(vp, uap->flags, vfs_context_current());
6905 
6906 	file_drop(uap->fd);
6907 	return error;
6908 }
6909 
6910 /*
6911  * Change security information on a filesystem object.
6912  *
6913  * Returns:	0			Success
6914  *		EPERM			Operation not permitted
6915  *		vnode_authattr:???	[anything vnode_authattr can return]
6916  *		vnode_authorize:???	[anything vnode_authorize can return]
6917  *		vnode_setattr:???	[anything vnode_setattr can return]
6918  *
6919  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
6920  *		translated to EPERM before being returned.
6921  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC checks: one hook per attribute class being modified */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* per the function header: permissions failures become EPERM */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC modules of the attributes that were actually changed */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
6989 
6990 
6991 /*
6992  * Change mode of a file given a path name.
6993  *
6994  * Returns:	0			Success
6995  *		namei:???		[anything namei can return]
6996  *		chmod_vnode:???		[anything chmod_vnode can return]
6997  */
6998 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)6999 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7000     int fd, int flag, enum uio_seg segflg)
7001 {
7002 	struct nameidata nd;
7003 	int follow, error;
7004 
7005 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7006 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
7007 	    segflg, path, ctx);
7008 	if ((error = nameiat(&nd, fd))) {
7009 		return error;
7010 	}
7011 	error = chmod_vnode(ctx, nd.ni_vp, vap);
7012 	vnode_put(nd.ni_vp);
7013 	nameidone(&nd);
7014 	return error;
7015 }
7016 
7017 /*
7018  * chmod_extended: Change the mode of a file given a path name; with extended
7019  * argument list (including extended security (ACL)).
7020  *
7021  * Parameters:	p			Process requesting the open
7022  *		uap			User argument descriptor (see below)
7023  *		retval			(ignored)
7024  *
7025  * Indirect:	uap->path		Path to object (same as 'chmod')
7026  *		uap->uid		UID to set
7027  *		uap->gid		GID to set
7028  *		uap->mode		File mode to set (same as 'chmod')
7029  *		uap->xsecurity		ACL to set (or delete)
7030  *
7031  * Returns:	0			Success
7032  *		!0			errno value
7033  *
7034  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7035  *
7036  * XXX:		We should enummerate the possible errno values here, and where
7037  *		in the code they originated.
7038  */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	VATTR_INIT(&va);
	/* -1 / KAUTH_*_NONE are "leave this attribute unchanged" sentinels */
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/* copy in the caller-supplied filesec and set the ACL from it */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* free the filesec allocated by the 'default' case above */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7085 
7086 /*
7087  * Returns:	0			Success
7088  *		chmodat:???		[anything chmodat can return]
7089  */
7090 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7091 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7092     int flag, enum uio_seg segflg)
7093 {
7094 	struct vnode_attr va;
7095 
7096 	VATTR_INIT(&va);
7097 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7098 
7099 	return chmodat(ctx, path, &va, fd, flag, segflg);
7100 }
7101 
7102 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7103 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7104 {
7105 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7106 	           AT_FDCWD, 0, UIO_USERSPACE);
7107 }
7108 
7109 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7110 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7111 {
7112 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7113 		return EINVAL;
7114 	}
7115 
7116 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7117 	           uap->fd, uap->flag, UIO_USERSPACE);
7118 }
7119 
7120 /*
7121  * Change mode of a file given a file descriptor.
7122  */
/*
 * Apply the attribute changes in 'vap' to the vnode behind open file 'fd'.
 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* file_vnode() takes a fileproc reference; paired with file_drop() */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* take an iocount on the vnode; dropped after the setattr below */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7146 
7147 /*
7148  * fchmod_extended: Change mode of a file given a file descriptor; with
7149  * extended argument list (including extended security (ACL)).
7150  *
7151  * Parameters:    p                       Process requesting to change file mode
7152  *                uap                     User argument descriptor (see below)
7153  *                retval                  (ignored)
7154  *
7155  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7156  *                uap->uid                UID to set
7157  *                uap->gid                GID to set
7158  *                uap->xsecurity          ACL to set (or delete)
7159  *                uap->fd                 File descriptor of file to change mode
7160  *
7161  * Returns:        0                      Success
7162  *                !0                      errno value
7163  *
7164  */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	VATTR_INIT(&va);
	/* -1 / KAUTH_*_NONE are "leave this attribute unchanged" sentinels */
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	/*
	 * NOTE: unlike chmod_extended(), USER_ADDR_NULL here means "remove
	 * the ACL", and -1 is the "not being set" sentinel.
	 */
	switch (uap->xsecurity) {
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	error = fchmod1(p, uap->fd, &va);


	/* only the 'default' case above can have allocated xsecdst */
	switch (uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL) {
			kauth_filesec_free(xsecdst);
		}
	}
	return error;
}
7218 
7219 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7220 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7221 {
7222 	struct vnode_attr va;
7223 
7224 	VATTR_INIT(&va);
7225 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7226 
7227 	return fchmod1(p, uap->fd, &va);
7228 }
7229 
7230 
7231 /*
7232  * Set ownership given a path name.
7233  */
7234 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	VATTR_INIT(&va);
	/* VNOVAL means "leave this id unchanged" */
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify MAC modules only on a successful change */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	vnode_put(vp);
	return error;
}
7301 
7302 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7303 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7304 {
7305 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7306 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7307 }
7308 
7309 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7310 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7311 {
7312 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7313 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7314 }
7315 
7316 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7317 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7318 {
7319 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7320 		return EINVAL;
7321 	}
7322 
7323 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7324 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7325 }
7326 
/*
 * Set ownership given a file descriptor.
 *
 * Unlike the path-based variants this operates on an already-open file;
 * VNOVAL for uid or gid means "leave unchanged".  EACCES from
 * vnode_authorize() is mapped to EPERM per chown semantics.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* file_vnode() takes a reference on the fileproc; dropped below. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* take an iocount so the vnode can't be reclaimed while we work */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* Request only the ids the caller actually wants changed. */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permissions failure reports EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7399 
7400 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)7401 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7402 {
7403 	int error;
7404 
7405 	if (usrtvp == USER_ADDR_NULL) {
7406 		struct timeval old_tv;
7407 		/* XXX Y2038 bug because of microtime argument */
7408 		microtime(&old_tv);
7409 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7410 		tsp[1] = tsp[0];
7411 	} else {
7412 		if (IS_64BIT_PROCESS(current_proc())) {
7413 			struct user64_timeval tv[2];
7414 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
7415 			if (error) {
7416 				return error;
7417 			}
7418 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
7419 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
7420 		} else {
7421 			struct user32_timeval tv[2];
7422 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
7423 			if (error) {
7424 				return error;
7425 			}
7426 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7427 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7428 		}
7429 	}
7430 	return 0;
7431 }
7432 
/*
 * Apply an atime/mtime pair (as produced by getutimes()) to a vnode.
 *
 * nullflag is non-zero when the caller passed a NULL times pointer,
 * i.e. "set to now"; in that case VA_UTIMES_NULL is set so the
 * filesystem can apply the relaxed write-permission check, and EACCES
 * is NOT remapped to EPERM.  The caller must hold an iocount on vp.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* preflight the attribute change; explicit times need ownership */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7489 
/*
 * Set the access and modification times of a file.
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

	/* last arg tells setutimes that a NULL times pointer means "now" */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* release the iocount taken by namei() */
	vnode_put(nd.ni_vp);
	return error;
}
7530 
7531 /*
7532  * Set the access and modification times of a file.
7533  */
7534 /* ARGSUSED */
7535 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)7536 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7537 {
7538 	struct timespec ts[2];
7539 	vnode_t vp;
7540 	user_addr_t usrtvp;
7541 	int error;
7542 
7543 	AUDIT_ARG(fd, uap->fd);
7544 	usrtvp = uap->tptr;
7545 	if ((error = getutimes(usrtvp, ts)) != 0) {
7546 		return error;
7547 	}
7548 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
7549 		return error;
7550 	}
7551 	if ((error = vnode_getwithref(vp))) {
7552 		file_drop(uap->fd);
7553 		return error;
7554 	}
7555 
7556 	error =  setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7557 	vnode_put(vp);
7558 	file_drop(uap->fd);
7559 	return error;
7560 }
7561 
/*
 * Truncate a file given its path name.
 */
/* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;
	rlim_t fsize_limit;

	/* negative lengths are invalid */
	if (uap->length < 0) {
		return EINVAL;
	}

	/* lengths beyond RLIMIT_FSIZE raise SIGXFSZ and fail with EFBIG */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* truncation is expressed as a data-size change via vnode_setattr() */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize the size change */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	/* release the iocount taken by namei() */
	vnode_put(vp);
	return error;
}
7624 
/*
 * Truncate a file given a file descriptor.
 *
 * Also handles POSIX shared memory objects (DTYPE_PSXSHM), which are
 * routed to pshm_truncate().  The descriptor must be open for writing.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	rlim_t fsize_limit;

	AUDIT_ARG(fd, uap->fd);
	/* negative lengths are invalid */
	if (uap->length < 0) {
		return EINVAL;
	}

	/* lengths beyond RLIMIT_FSIZE raise SIGXFSZ and fail with EFBIG */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	/* take a reference on the fileproc; dropped at "out:" */
	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory has its own truncate path */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* descriptor must be open for writing */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->fp_glob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	/* truncation is expressed as a data-size change via vnode_setattr() */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7703 
7704 
/*
 * Sync an open file with synchronized I/O _file_ integrity completion
 */
/* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync(2) is a pthread cancellation point */
	__pthread_testcancel(1);
	/* MNT_WAIT: full file integrity, data plus metadata */
	return fsync_common(p, uap, MNT_WAIT);
}
7715 
7716 
/*
 * Sync an open file with synchronized I/O _file_ integrity completion
 *
 * Notes:	This is a legacy support function that does not test for
 *		thread cancellation points.
 */
/* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* the argument structures are identical; see fsync_common() notes */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
7729 
7730 
/*
 * Sync an open file with synchronized I/O _data_ integrity completion
 */
/* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync(2) is a pthread cancellation point */
	__pthread_testcancel(1);
	/* MNT_DWAIT: only data required to retrieve file contents */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
7741 
7742 
/*
 * fsync_common
 *
 * Common fsync code to support both synchronized I/O file integrity completion
 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
 *
 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
 * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
 * includes additional metadata unnecessary for retrieving the file data
 * contents, such as atime, mtime, ctime, etc., also be committed to stable
 * storage.
 *
 * Parameters:	p				The process
 *		uap->fd				The descriptor to synchronize
 *		flags				The data integrity flags
 *
 * Returns:	int				Success
 *	fp_getfvp:EBADF				Bad file descriptor
 *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
 *	VNOP_FSYNC:???				unspecified
 *
 * Notes:	We use struct fsync_args because it is a short name, and all
 *		caller argument structures are otherwise identical.
 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* take an iocount so the vnode can't be reclaimed under us */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7805 
/*
 * Duplicate files.  Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 *     perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* look up the source (no FOLLOW flag: symlinks are not traversed) */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* look up the target for creation; SAVESTART keeps ni_startdir held */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* an existing target is only acceptable with CPF_OVERWRITE */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* directories may be neither source nor target */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* sockets cannot be copied, except fdesc vnodes (VT_FDESC) */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* must be able to read the source ... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ... delete an existing target ... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ... and add the new file to the target directory */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* the source may not be the target directory itself */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;     /* internal sentinel: success, nothing to do */
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);        /* reference held because of SAVESTART */
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* translate the "same file" sentinel back to success */
	if (error == -1) {
		return 0;
	}
	return error;
}
7915 
7916 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7917 
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 *
 * fvp			source vnode (iocount held by caller)
 * data_read_authorised	TRUE when read-data access on the source has already
 *			been established (e.g. via an fd opened for reading),
 *			so it is skipped here
 * dst_dirfd/dst	destination directory fd and user-space path
 * flags		CLONE_NOFOLLOW / CLONE_NOOWNERCOPY
 *
 * Only regular files, symlinks and (non-root, non-mountpoint) directories
 * may be cloned; source and destination must be on the same mount.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* only files, symlinks and plain directories are clonable */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* resolve destination; it must not already exist */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* cloning is only supported within a single mount */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* may we add an entry to the destination directory? */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* may we read the source (data read possibly pre-authorized)? */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACL. The clone file
	 * will inherit the target directory's ACL.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/*
	 * NOTE(review): va_acl is tested here but was never VATTR_WANTED
	 * above, so this relies on the filesystem reporting it unasked —
	 * confirm whether this branch is ever taken.
	 */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8142 
8143 /*
8144  * clone files or directories, target must not exist.
8145  */
8146 /* ARGSUSED */
8147 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8148 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8149     __unused int32_t *retval)
8150 {
8151 	vnode_t fvp;
8152 	struct nameidata fromnd;
8153 	int follow;
8154 	int error;
8155 	vfs_context_t ctx = vfs_context_current();
8156 
8157 	/* Check that the flags are valid. */
8158 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8159 		return EINVAL;
8160 	}
8161 
8162 	AUDIT_ARG(fd, uap->src_dirfd);
8163 
8164 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8165 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8166 	    UIO_USERSPACE, uap->src, ctx);
8167 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8168 		return error;
8169 	}
8170 
8171 	fvp = fromnd.ni_vp;
8172 	nameidone(&fromnd);
8173 
8174 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8175 	    uap->flags, ctx);
8176 
8177 	vnode_put(fvp);
8178 	return error;
8179 }
8180 
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* takes a reference on the fileproc; dropped at "out:" */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* the source descriptor must be open for reading */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: the FREAD check above stands in for read-data authorization */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8221 
8222 static int
rename_submounts_callback(mount_t mp,void * arg)8223 rename_submounts_callback(mount_t mp, void *arg)
8224 {
8225 	int error = 0;
8226 	mount_t pmp = (mount_t)arg;
8227 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8228 
8229 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8230 		return 0;
8231 	}
8232 
8233 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8234 		return 0;
8235 	}
8236 
8237 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8238 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8239 		return -1;
8240 	}
8241 
8242 	int pathlen = MAXPATHLEN;
8243 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8244 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8245 	}
8246 
8247 	vfs_unbusy(mp);
8248 
8249 	return error;
8250 }
8251 
8252 /*
8253  * Rename files.  Source and destination must either both be directories,
8254  * or both not be directories.  If target is a directory, it must be empty.
8255  */
8256 /* ARGSUSED */
8257 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,vfs_rename_flags_t flags)8258 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8259     int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
8260 {
8261 	if (flags & ~VFS_RENAME_FLAGS_MASK) {
8262 		return EINVAL;
8263 	}
8264 
8265 	if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
8266 		return EINVAL;
8267 	}
8268 
8269 	vnode_t tvp, tdvp;
8270 	vnode_t fvp, fdvp;
8271 	vnode_t mnt_fvp;
8272 	struct nameidata *fromnd, *tond;
8273 	int error;
8274 	int do_retry;
8275 	int retry_count;
8276 	int mntrename;
8277 	int need_event;
8278 	int need_kpath2;
8279 	int has_listeners;
8280 	const char *oname = NULL;
8281 	char *from_name = NULL, *to_name = NULL;
8282 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8283 	int from_len = 0, to_len = 0;
8284 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8285 	int holding_mntlock;
8286 	int vn_authorize_skipped;
8287 	mount_t locked_mp = NULL;
8288 	vnode_t oparent = NULLVP;
8289 #if CONFIG_FSE
8290 	fse_info from_finfo, to_finfo;
8291 #endif
8292 	int from_truncated = 0, to_truncated = 0;
8293 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8294 	int batched = 0;
8295 	struct vnode_attr *fvap, *tvap;
8296 	int continuing = 0;
8297 	/* carving out a chunk for structs that are too big to be on stack. */
8298 	struct {
8299 		struct nameidata from_node, to_node;
8300 		struct vnode_attr fv_attr, tv_attr;
8301 	} * __rename_data;
8302 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8303 	fromnd = &__rename_data->from_node;
8304 	tond = &__rename_data->to_node;
8305 
8306 	holding_mntlock = 0;
8307 	do_retry = 0;
8308 	retry_count = 0;
8309 retry:
8310 	fvp = tvp = NULL;
8311 	fdvp = tdvp = NULL;
8312 	fvap = tvap = NULL;
8313 	mnt_fvp = NULLVP;
8314 	mntrename = FALSE;
8315 	vn_authorize_skipped = FALSE;
8316 
8317 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8318 	    segflg, from, ctx);
8319 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8320 
8321 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8322 	    segflg, to, ctx);
8323 	tond->ni_flag = NAMEI_COMPOUNDRENAME;
8324 
8325 continue_lookup:
8326 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8327 		if ((error = nameiat(fromnd, fromfd))) {
8328 			goto out1;
8329 		}
8330 		fdvp = fromnd->ni_dvp;
8331 		fvp  = fromnd->ni_vp;
8332 
8333 		if (fvp && fvp->v_type == VDIR) {
8334 			tond->ni_cnd.cn_flags |= WILLBEDIR;
8335 		}
8336 	}
8337 
8338 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8339 		if ((error = nameiat(tond, tofd))) {
8340 			/*
8341 			 * Translate error code for rename("dir1", "dir2/.").
8342 			 */
8343 			if (error == EISDIR && fvp->v_type == VDIR) {
8344 				error = EINVAL;
8345 			}
8346 			goto out1;
8347 		}
8348 		tdvp = tond->ni_dvp;
8349 		tvp  = tond->ni_vp;
8350 	}
8351 
8352 #if DEVELOPMENT || DEBUG
8353 	/*
8354 	 * XXX VSWAP: Check for entitlements or special flag here
8355 	 * so we can restrict access appropriately.
8356 	 */
8357 #else /* DEVELOPMENT || DEBUG */
8358 
8359 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8360 		error = EPERM;
8361 		goto out1;
8362 	}
8363 
8364 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8365 		error = EPERM;
8366 		goto out1;
8367 	}
8368 #endif /* DEVELOPMENT || DEBUG */
8369 
8370 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8371 		error = ENOENT;
8372 		goto out1;
8373 	}
8374 
8375 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8376 		int32_t pval = 0;
8377 		int err = 0;
8378 
8379 		/*
8380 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8381 		 * has the same name as target iff the following conditions are met:
8382 		 * 1. the target file system is case insensitive
8383 		 * 2. source and target directories are the same
8384 		 * 3. source and target files are the same
8385 		 * 4. name only differs in case (determined by underlying filesystem)
8386 		 */
8387 		if (fvp != tvp || fdvp != tdvp) {
8388 			error = EEXIST;
8389 			goto out1;
8390 		}
8391 
8392 		/*
8393 		 * Assume that the target file system is case sensitive if
8394 		 * _PC_CASE_SENSITIVE selector isn't supported.
8395 		 */
8396 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8397 		if (err != 0 || pval != 0) {
8398 			error = EEXIST;
8399 			goto out1;
8400 		}
8401 	}
8402 
8403 	batched = vnode_compound_rename_available(fdvp);
8404 
8405 #if CONFIG_FSE
8406 	need_event = need_fsevent(FSE_RENAME, fdvp);
8407 	if (need_event) {
8408 		if (fvp) {
8409 			get_fse_info(fvp, &from_finfo, ctx);
8410 		} else {
8411 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8412 			if (error) {
8413 				goto out1;
8414 			}
8415 
8416 			fvap = &__rename_data->fv_attr;
8417 		}
8418 
8419 		if (tvp) {
8420 			get_fse_info(tvp, &to_finfo, ctx);
8421 		} else if (batched) {
8422 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8423 			if (error) {
8424 				goto out1;
8425 			}
8426 
8427 			tvap = &__rename_data->tv_attr;
8428 		}
8429 	}
8430 #else
8431 	need_event = 0;
8432 #endif /* CONFIG_FSE */
8433 
8434 	has_listeners = kauth_authorize_fileop_has_listeners();
8435 
8436 	need_kpath2 = 0;
8437 #if CONFIG_AUDIT
8438 	if (AUDIT_RECORD_EXISTS()) {
8439 		need_kpath2 = 1;
8440 	}
8441 #endif
8442 
8443 	if (need_event || has_listeners) {
8444 		if (from_name == NULL) {
8445 			GET_PATH(from_name);
8446 		}
8447 
8448 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8449 
8450 		if (from_name_no_firmlink == NULL) {
8451 			GET_PATH(from_name_no_firmlink);
8452 		}
8453 
8454 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8455 	}
8456 
8457 	if (need_event || need_kpath2 || has_listeners) {
8458 		if (to_name == NULL) {
8459 			GET_PATH(to_name);
8460 		}
8461 
8462 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8463 
8464 		if (to_name_no_firmlink == NULL) {
8465 			GET_PATH(to_name_no_firmlink);
8466 		}
8467 
8468 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8469 		if (to_name && need_kpath2) {
8470 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8471 		}
8472 	}
8473 	if (!fvp) {
8474 		/*
8475 		 * Claim: this check will never reject a valid rename.
8476 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8477 		 * Suppose fdvp and tdvp are not on the same mount.
8478 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
8479 		 *      then you can't move it to within another dir on the same mountpoint.
8480 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8481 		 *
8482 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
8483 		 */
8484 		if (fdvp->v_mount != tdvp->v_mount) {
8485 			error = EXDEV;
8486 			goto out1;
8487 		}
8488 		goto skipped_lookup;
8489 	}
8490 
8491 	/*
8492 	 * If the source and destination are the same (i.e. they're
8493 	 * links to the same vnode) and the target file system is
8494 	 * case sensitive, then there is nothing to do.
8495 	 *
8496 	 * XXX Come back to this.
8497 	 */
8498 	if (fvp == tvp) {
8499 		int pathconf_val;
8500 
8501 		/*
8502 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8503 		 * then assume that this file system is case sensitive.
8504 		 */
8505 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8506 		    pathconf_val != 0) {
8507 			vn_authorize_skipped = TRUE;
8508 			goto out1;
8509 		}
8510 	}
8511 
8512 	/*
8513 	 * Allow the renaming of mount points.
8514 	 * - target must not exist
8515 	 * - target must reside in the same directory as source
8516 	 * - union mounts cannot be renamed
8517 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
8518 	 *
8519 	 * XXX Handle this in VFS after a continued lookup (if we missed
8520 	 * in the cache to start off)
8521 	 *
8522 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8523 	 * we'll skip past here.  The file system is responsible for
8524 	 * checking that @tvp is not a descendent of @fvp and vice versa
8525 	 * so it should always return EINVAL if either @tvp or @fvp is the
8526 	 * root of a volume.
8527 	 */
8528 	if ((fvp->v_flag & VROOT) &&
8529 	    (fvp->v_type == VDIR) &&
8530 	    (tvp == NULL) &&
8531 	    (fvp->v_mountedhere == NULL) &&
8532 	    (fdvp == tdvp) &&
8533 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8534 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8535 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8536 		vnode_t coveredvp;
8537 
8538 		/* switch fvp to the covered vnode */
8539 		coveredvp = fvp->v_mount->mnt_vnodecovered;
8540 		if ((vnode_getwithref(coveredvp))) {
8541 			error = ENOENT;
8542 			goto out1;
8543 		}
8544 		/*
8545 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
8546 		 * later.
8547 		 */
8548 		mnt_fvp = fvp;
8549 
8550 		fvp = coveredvp;
8551 		mntrename = TRUE;
8552 	}
8553 	/*
8554 	 * Check for cross-device rename.
8555 	 */
8556 	if ((fvp->v_mount != tdvp->v_mount) ||
8557 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
8558 		error = EXDEV;
8559 		goto out1;
8560 	}
8561 
8562 	/*
8563 	 * If source is the same as the destination (that is the
8564 	 * same inode number) then there is nothing to do...
8565 	 * EXCEPT if the underlying file system supports case
8566 	 * insensitivity and is case preserving.  In this case
8567 	 * the file system needs to handle the special case of
8568 	 * getting the same vnode as target (fvp) and source (tvp).
8569 	 *
8570 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8571 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
8572 	 * handle the special case of getting the same vnode as target and
8573 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
8574 	 * so not to cause locking problems. There is a single reference on tvp.
8575 	 *
8576 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
8577 	 * that correct behaviour then is just to return success without doing
8578 	 * anything.
8579 	 *
8580 	 * XXX filesystem should take care of this itself, perhaps...
8581 	 */
8582 	if (fvp == tvp && fdvp == tdvp) {
8583 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8584 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8585 		    fromnd->ni_cnd.cn_namelen)) {
8586 			vn_authorize_skipped = TRUE;
8587 			goto out1;
8588 		}
8589 	}
8590 
8591 	if (holding_mntlock && fvp->v_mount != locked_mp) {
8592 		/*
8593 		 * we're holding a reference and lock
8594 		 * on locked_mp, but it no longer matches
8595 		 * what we want to do... so drop our hold
8596 		 */
8597 		mount_unlock_renames(locked_mp);
8598 		mount_drop(locked_mp, 0);
8599 		holding_mntlock = 0;
8600 	}
8601 	if (tdvp != fdvp && fvp->v_type == VDIR) {
8602 		/*
8603 		 * serialize renames that re-shape
8604 		 * the tree... if holding_mntlock is
8605 		 * set, then we're ready to go...
8606 		 * otherwise we
8607 		 * first need to drop the iocounts
8608 		 * we picked up, second take the
8609 		 * lock to serialize the access,
8610 		 * then finally start the lookup
8611 		 * process over with the lock held
8612 		 */
8613 		if (!holding_mntlock) {
8614 			/*
8615 			 * need to grab a reference on
8616 			 * the mount point before we
8617 			 * drop all the iocounts... once
8618 			 * the iocounts are gone, the mount
8619 			 * could follow
8620 			 */
8621 			locked_mp = fvp->v_mount;
8622 			mount_ref(locked_mp, 0);
8623 
8624 			/*
8625 			 * nameidone has to happen before we vnode_put(tvp)
8626 			 * since it may need to release the fs_nodelock on the tvp
8627 			 */
8628 			nameidone(tond);
8629 
8630 			if (tvp) {
8631 				vnode_put(tvp);
8632 			}
8633 			vnode_put(tdvp);
8634 
8635 			/*
8636 			 * nameidone has to happen before we vnode_put(fdvp)
8637 			 * since it may need to release the fs_nodelock on the fvp
8638 			 */
8639 			nameidone(fromnd);
8640 
8641 			vnode_put(fvp);
8642 			vnode_put(fdvp);
8643 
8644 			if (mnt_fvp != NULLVP) {
8645 				vnode_put(mnt_fvp);
8646 			}
8647 
8648 			mount_lock_renames(locked_mp);
8649 			holding_mntlock = 1;
8650 
8651 			goto retry;
8652 		}
8653 	} else {
8654 		/*
8655 		 * when we dropped the iocounts to take
8656 		 * the lock, we allowed the identity of
8657 		 * the various vnodes to change... if they did,
8658 		 * we may no longer be dealing with a rename
8659 		 * that reshapes the tree... once we're holding
8660 		 * the iocounts, the vnodes can't change type
8661 		 * so we're free to drop the lock at this point
8662 		 * and continue on
8663 		 */
8664 		if (holding_mntlock) {
8665 			mount_unlock_renames(locked_mp);
8666 			mount_drop(locked_mp, 0);
8667 			holding_mntlock = 0;
8668 		}
8669 	}
8670 
8671 	if (!batched) {
8672 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
8673 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8674 		    flags, NULL);
8675 		if (error) {
8676 			if (error == ENOENT) {
8677 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8678 					/*
8679 					 * We encountered a race where after doing the namei,
8680 					 * tvp stops being valid. If so, simply re-drive the rename
8681 					 * call from the top.
8682 					 */
8683 					do_retry = 1;
8684 					retry_count += 1;
8685 				}
8686 			}
8687 			goto out1;
8688 		}
8689 	}
8690 
8691 	/* Release the 'mnt_fvp' now that it is no longer needed. */
8692 	if (mnt_fvp != NULLVP) {
8693 		vnode_put(mnt_fvp);
8694 		mnt_fvp = NULLVP;
8695 	}
8696 
8697 	// save these off so we can later verify that fvp is the same
8698 	oname   = fvp->v_name;
8699 	oparent = fvp->v_parent;
8700 
8701 skipped_lookup:
8702 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8703 	    tdvp, &tvp, &tond->ni_cnd, tvap,
8704 	    flags, ctx);
8705 
8706 	if (holding_mntlock) {
8707 		/*
8708 		 * we can drop our serialization
8709 		 * lock now
8710 		 */
8711 		mount_unlock_renames(locked_mp);
8712 		mount_drop(locked_mp, 0);
8713 		holding_mntlock = 0;
8714 	}
8715 	if (error) {
8716 		if (error == EDATALESS) {
8717 			/*
8718 			 * If we've been here before, something has gone
8719 			 * horribly wrong and we should just get out lest
8720 			 * we spiral around the drain forever.
8721 			 */
8722 			if (flags & VFS_RENAME_DATALESS) {
8723 				error = EIO;
8724 				goto out1;
8725 			}
8726 
8727 			/*
8728 			 * The object we're renaming is dataless (or has a
8729 			 * dataless descendent) and requires materialization
8730 			 * before the rename occurs.  But we're holding the
8731 			 * mount point's rename lock, so it's not safe to
8732 			 * make the upcall.
8733 			 *
8734 			 * In this case, we release the lock, perform the
8735 			 * materialization, and start the whole thing over.
8736 			 */
8737 			error = vnode_materialize_dataless_file(fvp,
8738 			    NAMESPACE_HANDLER_RENAME_OP);
8739 
8740 			if (error == 0) {
8741 				/*
8742 				 * The next time around we need to tell the
8743 				 * file system that the materializtaion has
8744 				 * been performed.
8745 				 */
8746 				flags |= VFS_RENAME_DATALESS;
8747 				do_retry = 1;
8748 			}
8749 			goto out1;
8750 		}
8751 		if (error == EKEEPLOOKING) {
8752 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8753 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8754 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8755 				}
8756 			}
8757 
8758 			fromnd->ni_vp = fvp;
8759 			tond->ni_vp = tvp;
8760 
8761 			goto continue_lookup;
8762 		}
8763 
8764 		/*
8765 		 * We may encounter a race in the VNOP where the destination didn't
8766 		 * exist when we did the namei, but it does by the time we go and
8767 		 * try to create the entry. In this case, we should re-drive this rename
8768 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
8769 		 * but other filesystems susceptible to this race could return it, too.
8770 		 */
8771 		if (error == ERECYCLE) {
8772 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8773 				do_retry = 1;
8774 				retry_count += 1;
8775 			} else {
8776 				printf("rename retry limit due to ERECYCLE reached\n");
8777 				error = ENOENT;
8778 			}
8779 		}
8780 
8781 		/*
8782 		 * For compound VNOPs, the authorization callback may return
8783 		 * ENOENT in case of racing hardlink lookups hitting the name
8784 		 * cache, redrive the lookup.
8785 		 */
8786 		if (batched && error == ENOENT) {
8787 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8788 				do_retry = 1;
8789 				retry_count += 1;
8790 			}
8791 		}
8792 
8793 		goto out1;
8794 	}
8795 
8796 	/* call out to allow 3rd party notification of rename.
8797 	 * Ignore result of kauth_authorize_fileop call.
8798 	 */
8799 	kauth_authorize_fileop(vfs_context_ucred(ctx),
8800 	    KAUTH_FILEOP_RENAME,
8801 	    (uintptr_t)from_name, (uintptr_t)to_name);
8802 	if (flags & VFS_RENAME_SWAP) {
8803 		kauth_authorize_fileop(vfs_context_ucred(ctx),
8804 		    KAUTH_FILEOP_RENAME,
8805 		    (uintptr_t)to_name, (uintptr_t)from_name);
8806 	}
8807 
8808 #if CONFIG_FSE
8809 	if (from_name != NULL && to_name != NULL) {
8810 		if (from_truncated || to_truncated) {
8811 			// set it here since only the from_finfo gets reported up to user space
8812 			from_finfo.mode |= FSE_TRUNCATED_PATH;
8813 		}
8814 
8815 		if (tvap && tvp) {
8816 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8817 		}
8818 		if (fvap) {
8819 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8820 		}
8821 
8822 		if (tvp) {
8823 			add_fsevent(FSE_RENAME, ctx,
8824 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8825 			    FSE_ARG_FINFO, &from_finfo,
8826 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8827 			    FSE_ARG_FINFO, &to_finfo,
8828 			    FSE_ARG_DONE);
8829 			if (flags & VFS_RENAME_SWAP) {
8830 				/*
8831 				 * Strictly speaking, swap is the equivalent of
8832 				 * *three* renames.  FSEvents clients should only take
8833 				 * the events as a hint, so we only bother reporting
8834 				 * two.
8835 				 */
8836 				add_fsevent(FSE_RENAME, ctx,
8837 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8838 				    FSE_ARG_FINFO, &to_finfo,
8839 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8840 				    FSE_ARG_FINFO, &from_finfo,
8841 				    FSE_ARG_DONE);
8842 			}
8843 		} else {
8844 			add_fsevent(FSE_RENAME, ctx,
8845 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8846 			    FSE_ARG_FINFO, &from_finfo,
8847 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8848 			    FSE_ARG_DONE);
8849 		}
8850 	}
8851 #endif /* CONFIG_FSE */
8852 
8853 	/*
8854 	 * update filesystem's mount point data
8855 	 */
8856 	if (mntrename) {
8857 		char *cp, *pathend, *mpname;
8858 		char * tobuf;
8859 		struct mount *mp;
8860 		int maxlen;
8861 		size_t len = 0;
8862 
8863 		mp = fvp->v_mountedhere;
8864 
8865 		if (vfs_busy(mp, LK_NOWAIT)) {
8866 			error = EBUSY;
8867 			goto out1;
8868 		}
8869 		tobuf = zalloc(ZV_NAMEI);
8870 
8871 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
8872 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8873 		} else {
8874 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8875 		}
8876 		if (!error) {
8877 			/* find current mount point prefix */
8878 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
8879 			for (cp = pathend; *cp != '\0'; ++cp) {
8880 				if (*cp == '/') {
8881 					pathend = cp + 1;
8882 				}
8883 			}
8884 			/* find last component of target name */
8885 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8886 				if (*cp == '/') {
8887 					mpname = cp + 1;
8888 				}
8889 			}
8890 
8891 			/* Update f_mntonname of sub mounts */
8892 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
8893 
8894 			/* append name to prefix */
8895 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
8896 			bzero(pathend, maxlen);
8897 
8898 			strlcpy(pathend, mpname, maxlen);
8899 		}
8900 		zfree(ZV_NAMEI, tobuf);
8901 
8902 		vfs_unbusy(mp);
8903 
8904 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8905 	}
8906 	/*
8907 	 * fix up name & parent pointers.  note that we first
8908 	 * check that fvp has the same name/parent pointers it
8909 	 * had before the rename call... this is a 'weak' check
8910 	 * at best...
8911 	 *
8912 	 * XXX oparent and oname may not be set in the compound vnop case
8913 	 */
8914 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8915 		int update_flags;
8916 
8917 		update_flags = VNODE_UPDATE_NAME;
8918 
8919 		if (fdvp != tdvp) {
8920 			update_flags |= VNODE_UPDATE_PARENT;
8921 		}
8922 
8923 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8924 	}
8925 out1:
8926 	/*
8927 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
8928 	 * skipped earlier as no actual rename was performed.
8929 	 */
8930 	if (vn_authorize_skipped && error == 0) {
8931 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
8932 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8933 		    flags, NULL);
8934 		if (error && error == ENOENT) {
8935 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8936 				do_retry = 1;
8937 				retry_count += 1;
8938 			}
8939 		}
8940 	}
8941 	if (to_name != NULL) {
8942 		RELEASE_PATH(to_name);
8943 		to_name = NULL;
8944 	}
8945 	if (to_name_no_firmlink != NULL) {
8946 		RELEASE_PATH(to_name_no_firmlink);
8947 		to_name_no_firmlink = NULL;
8948 	}
8949 	if (from_name != NULL) {
8950 		RELEASE_PATH(from_name);
8951 		from_name = NULL;
8952 	}
8953 	if (from_name_no_firmlink != NULL) {
8954 		RELEASE_PATH(from_name_no_firmlink);
8955 		from_name_no_firmlink = NULL;
8956 	}
8957 	if (holding_mntlock) {
8958 		mount_unlock_renames(locked_mp);
8959 		mount_drop(locked_mp, 0);
8960 		holding_mntlock = 0;
8961 	}
8962 	if (tdvp) {
8963 		/*
8964 		 * nameidone has to happen before we vnode_put(tdvp)
8965 		 * since it may need to release the fs_nodelock on the tdvp
8966 		 */
8967 		nameidone(tond);
8968 
8969 		if (tvp) {
8970 			vnode_put(tvp);
8971 		}
8972 		vnode_put(tdvp);
8973 	}
8974 	if (fdvp) {
8975 		/*
8976 		 * nameidone has to happen before we vnode_put(fdvp)
8977 		 * since it may need to release the fs_nodelock on the fdvp
8978 		 */
8979 		nameidone(fromnd);
8980 
8981 		if (fvp) {
8982 			vnode_put(fvp);
8983 		}
8984 		vnode_put(fdvp);
8985 	}
8986 	if (mnt_fvp != NULLVP) {
8987 		vnode_put(mnt_fvp);
8988 	}
8989 	/*
8990 	 * If things changed after we did the namei, then we will re-drive
8991 	 * this rename call from the top.
8992 	 */
8993 	if (do_retry) {
8994 		do_retry = 0;
8995 		goto retry;
8996 	}
8997 
8998 	kfree_type(typeof(*__rename_data), __rename_data);
8999 	return error;
9000 }
9001 
9002 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9003 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9004 {
9005 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9006 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9007 }
9008 
9009 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9010 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9011 {
9012 	return renameat_internal(
9013 		vfs_context_current(),
9014 		uap->fromfd, uap->from,
9015 		uap->tofd, uap->to,
9016 		UIO_USERSPACE, uap->flags);
9017 }
9018 
9019 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9020 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9021 {
9022 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9023 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9024 }
9025 
9026 /*
9027  * Make a directory file.
9028  *
9029  * Returns:	0			Success
9030  *		EEXIST
9031  *	namei:???
9032  *	vnode_authorize:???
9033  *	vn_create:???
9034  */
9035 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;        /* non-zero if the FS supports compound mkdir */
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Resolve the parent directory; request a compound
	 * lookup+mkdir so capable filesystems can do both in one VNOP.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	/* Re-entered when vn_create() returns EKEEPLOOKING (compound VNOP). */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	if (vp != NULL) {
		/* Target already exists; drop references and bail. */
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EEXIST or EISDIR for existing path components, and not that it could see
	 * EACCES/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/*
			 * Tear down the first lookup before re-driving it;
			 * nameidone must precede vnode_put(dvp) (fs_nodelock).
			 */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCES if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: report original EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* FS needs VFS to continue the lookup; restart it. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9147 
9148 /*
9149  * mkdir_extended: Create a directory; with extended security (ACL).
9150  *
9151  * Parameters:    p                       Process requesting to create the directory
9152  *                uap                     User argument descriptor (see below)
9153  *                retval                  (ignored)
9154  *
9155  * Indirect:      uap->path               Path of directory to create
9156  *                uap->mode               Access permissions to set
9157  *                uap->xsecurity          ACL to set
9158  *
9159  * Returns:        0                      Success
9160  *                !0                      Not success
9161  *
9162  */
9163 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9164 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9165 {
9166 	int ciferror;
9167 	kauth_filesec_t xsecdst;
9168 	struct vnode_attr va;
9169 
9170 	AUDIT_ARG(owner, uap->uid, uap->gid);
9171 
9172 	xsecdst = NULL;
9173 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9174 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9175 		return ciferror;
9176 	}
9177 
9178 	VATTR_INIT(&va);
9179 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9180 	if (xsecdst != NULL) {
9181 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9182 		va.va_vaflags |= VA_FILESEC_ACL;
9183 	}
9184 
9185 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9186 	    UIO_USERSPACE);
9187 	if (xsecdst != NULL) {
9188 		kauth_filesec_free(xsecdst);
9189 	}
9190 	return ciferror;
9191 }
9192 
9193 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9194 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9195 {
9196 	struct vnode_attr va;
9197 
9198 	VATTR_INIT(&va);
9199 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9200 
9201 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9202 	           UIO_USERSPACE);
9203 }
9204 
9205 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9206 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9207 {
9208 	struct vnode_attr va;
9209 
9210 	VATTR_INIT(&va);
9211 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9212 
9213 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9214 	           UIO_USERSPACE);
9215 }
9216 
9217 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)9218 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9219     enum uio_seg segflg, int unlink_flags)
9220 {
9221 	struct {
9222 		struct nameidata nd;
9223 #if CONFIG_FSE
9224 		struct vnode_attr va;
9225 #endif /* CONFIG_FSE */
9226 	} *__rmdir_data;
9227 	vnode_t vp, dvp;
9228 	int error;
9229 	struct nameidata *ndp;
9230 	char     *path = NULL;
9231 	char     *no_firmlink_path = NULL;
9232 	int       len_path = 0;
9233 	int       len_no_firmlink_path = 0;
9234 	int has_listeners = 0;
9235 	int need_event = 0;
9236 	int truncated_path = 0;
9237 	int truncated_no_firmlink_path = 0;
9238 	struct vnode_attr *vap = NULL;
9239 	int restart_count = 0;
9240 	int batched;
9241 
9242 	int restart_flag;
9243 
9244 	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
9245 	ndp = &__rmdir_data->nd;
9246 
9247 	/*
9248 	 * This loop exists to restart rmdir in the unlikely case that two
9249 	 * processes are simultaneously trying to remove the same directory
9250 	 * containing orphaned appleDouble files.
9251 	 */
9252 	do {
9253 		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9254 		    segflg, dirpath, ctx);
9255 		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
9256 continue_lookup:
9257 		restart_flag = 0;
9258 		vap = NULL;
9259 
9260 		error = nameiat(ndp, fd);
9261 		if (error) {
9262 			goto err_out;
9263 		}
9264 
9265 		dvp = ndp->ni_dvp;
9266 		vp = ndp->ni_vp;
9267 
9268 		if (vp) {
9269 			batched = vnode_compound_rmdir_available(vp);
9270 
9271 			if (vp->v_flag & VROOT) {
9272 				/*
9273 				 * The root of a mounted filesystem cannot be deleted.
9274 				 */
9275 				error = EBUSY;
9276 				goto out;
9277 			}
9278 
9279 #if DEVELOPMENT || DEBUG
9280 			/*
9281 			 * XXX VSWAP: Check for entitlements or special flag here
9282 			 * so we can restrict access appropriately.
9283 			 */
9284 #else /* DEVELOPMENT || DEBUG */
9285 
9286 			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9287 				error = EPERM;
9288 				goto out;
9289 			}
9290 #endif /* DEVELOPMENT || DEBUG */
9291 
9292 			/*
9293 			 * Removed a check here; we used to abort if vp's vid
9294 			 * was not the same as what we'd seen the last time around.
9295 			 * I do not think that check was valid, because if we retry
9296 			 * and all dirents are gone, the directory could legitimately
9297 			 * be recycled but still be present in a situation where we would
9298 			 * have had permission to delete.  Therefore, we won't make
9299 			 * an effort to preserve that check now that we may not have a
9300 			 * vp here.
9301 			 */
9302 
9303 			if (!batched) {
9304 				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
9305 				if (error) {
9306 					if (error == ENOENT) {
9307 						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9308 							restart_flag = 1;
9309 							restart_count += 1;
9310 						}
9311 					}
9312 					goto out;
9313 				}
9314 			}
9315 		} else {
9316 			batched = 1;
9317 
9318 			if (!vnode_compound_rmdir_available(dvp)) {
9319 				panic("No error, but no compound rmdir?");
9320 			}
9321 		}
9322 
9323 #if CONFIG_FSE
9324 		fse_info  finfo = {0};
9325 
9326 		need_event = need_fsevent(FSE_DELETE, dvp);
9327 		if (need_event) {
9328 			if (!batched) {
9329 				get_fse_info(vp, &finfo, ctx);
9330 			} else {
9331 				error = vfs_get_notify_attributes(&__rmdir_data->va);
9332 				if (error) {
9333 					goto out;
9334 				}
9335 
9336 				vap = &__rmdir_data->va;
9337 			}
9338 		}
9339 #endif
9340 		has_listeners = kauth_authorize_fileop_has_listeners();
9341 		if (need_event || has_listeners) {
9342 			if (path == NULL) {
9343 				GET_PATH(path);
9344 			}
9345 
9346 			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
9347 
9348 			if (no_firmlink_path == NULL) {
9349 				GET_PATH(no_firmlink_path);
9350 			}
9351 
9352 			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
9353 #if CONFIG_FSE
9354 			if (truncated_no_firmlink_path) {
9355 				finfo.mode |= FSE_TRUNCATED_PATH;
9356 			}
9357 #endif
9358 		}
9359 
9360 		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
9361 		ndp->ni_vp = vp;
9362 		if (vp == NULLVP) {
9363 			/* Couldn't find a vnode */
9364 			goto out;
9365 		}
9366 
9367 		if (error == EKEEPLOOKING) {
9368 			goto continue_lookup;
9369 		} else if (batched && error == ENOENT) {
9370 			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9371 				/*
9372 				 * For compound VNOPs, the authorization callback
9373 				 * may return ENOENT in case of racing hard link lookups
9374 				 * redrive the lookup.
9375 				 */
9376 				restart_flag = 1;
9377 				restart_count += 1;
9378 				goto out;
9379 			}
9380 		}
9381 
9382 		/*
9383 		 * XXX There's no provision for passing flags
9384 		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
9385 		 * because it's not empty, then we try again
9386 		 * with VNOP_REMOVE(), passing in a special
9387 		 * flag that clever file systems will know
9388 		 * how to handle.
9389 		 */
9390 		if (error == ENOTEMPTY &&
9391 		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
9392 			/*
9393 			 * If this fails, we want to keep the original
9394 			 * error.
9395 			 */
9396 			if (vn_remove(dvp, &vp, ndp,
9397 			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
9398 				error = 0;
9399 			}
9400 		}
9401 
9402 #if CONFIG_APPLEDOUBLE
9403 		/*
9404 		 * Special case to remove orphaned AppleDouble
9405 		 * files. I don't like putting this in the kernel,
9406 		 * but carbon does not like putting this in carbon either,
9407 		 * so here we are.
9408 		 */
9409 		if (error == ENOTEMPTY) {
9410 			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
9411 			if (ad_error == EBUSY) {
9412 				error = ad_error;
9413 				goto out;
9414 			}
9415 
9416 
9417 			/*
9418 			 * Assuming everything went well, we will try the RMDIR again
9419 			 */
9420 			if (!ad_error) {
9421 				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
9422 			}
9423 		}
9424 #endif /* CONFIG_APPLEDOUBLE */
9425 		/*
9426 		 * Call out to allow 3rd party notification of delete.
9427 		 * Ignore result of kauth_authorize_fileop call.
9428 		 */
9429 		if (!error) {
9430 			if (has_listeners) {
9431 				kauth_authorize_fileop(vfs_context_ucred(ctx),
9432 				    KAUTH_FILEOP_DELETE,
9433 				    (uintptr_t)vp,
9434 				    (uintptr_t)path);
9435 			}
9436 
9437 			if (vp->v_flag & VISHARDLINK) {
9438 				// see the comment in unlink1() about why we update
9439 				// the parent of a hard link when it is removed
9440 				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
9441 			}
9442 
9443 #if CONFIG_FSE
9444 			if (need_event) {
9445 				if (vap) {
9446 					vnode_get_fse_info_from_vap(vp, &finfo, vap);
9447 				}
9448 				add_fsevent(FSE_DELETE, ctx,
9449 				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
9450 				    FSE_ARG_FINFO, &finfo,
9451 				    FSE_ARG_DONE);
9452 			}
9453 #endif
9454 		}
9455 
9456 out:
9457 		if (path != NULL) {
9458 			RELEASE_PATH(path);
9459 			path = NULL;
9460 		}
9461 
9462 		if (no_firmlink_path != NULL) {
9463 			RELEASE_PATH(no_firmlink_path);
9464 			no_firmlink_path = NULL;
9465 		}
9466 
9467 		/*
9468 		 * nameidone has to happen before we vnode_put(dvp)
9469 		 * since it may need to release the fs_nodelock on the dvp
9470 		 */
9471 		nameidone(ndp);
9472 		vnode_put(dvp);
9473 
9474 		if (vp) {
9475 			vnode_put(vp);
9476 		}
9477 
9478 		if (restart_flag == 0) {
9479 			wakeup_one((caddr_t)vp);
9480 			goto err_out;
9481 		}
9482 		tsleep(vp, PVFS, "rm AD", 1);
9483 	} while (restart_flag != 0);
9484 
9485 err_out:
9486 	kfree_type(typeof(*__rmdir_data), __rmdir_data);
9487 
9488 	return error;
9489 }
9490 
9491 /*
9492  * Remove a directory file.
9493  */
9494 /* ARGSUSED */
9495 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)9496 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9497 {
9498 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9499 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9500 }
9501 
/*
 * Get direntry length padded to 8 byte alignment.
 * Subtracts (MAXPATHLEN-1) before rounding — presumably the unused tail of
 * struct direntry's fixed-size d_name array; confirm against the struct
 * declaration if changing this.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Same shape as DIRENT64_LEN but for the legacy struct dirent, whose name
 * array is (__DARWIN_MAXNAMLEN + 1) bytes.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/*
 * Get the end of this dirent: address of its last byte, as claimed by the
 * entry's own d_reclen field (used for bounds-checking untrusted entries).
 */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
9513 
9514 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)9515 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9516     int *numdirent, vfs_context_t ctxp)
9517 {
9518 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
9519 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9520 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9521 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9522 	} else {
9523 		size_t bufsize;
9524 		void * bufptr;
9525 		uio_t auio;
9526 		struct direntry *entry64;
9527 		struct dirent *dep;
9528 		size_t bytesread;
9529 		int error;
9530 
9531 		/*
9532 		 * We're here because the underlying file system does not
9533 		 * support direnties or we mounted denying support so we must
9534 		 * fall back to dirents and convert them to direntries.
9535 		 *
9536 		 * Our kernel buffer needs to be smaller since re-packing will
9537 		 * expand each dirent.  The worse case (when the name length
9538 		 * is 3 or less) corresponds to a struct direntry size of 32
9539 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9540 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
9541 		 * will prevent us from reading more than we can pack.
9542 		 *
9543 		 * Since this buffer is wired memory, we will limit the
9544 		 * buffer size to a maximum of 32K. We would really like to
9545 		 * use 32K in the MIN(), but we use magic number 87371 to
9546 		 * prevent uio_resid() * 3 / 8 from overflowing.
9547 		 */
9548 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9549 		bufptr = kalloc_data(bufsize, Z_WAITOK);
9550 		if (bufptr == NULL) {
9551 			return ENOMEM;
9552 		}
9553 
9554 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9555 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9556 		auio->uio_offset = uio->uio_offset;
9557 
9558 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9559 
9560 		dep = (struct dirent *)bufptr;
9561 		bytesread = bufsize - uio_resid(auio);
9562 
9563 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
9564 		/*
9565 		 * Convert all the entries and copy them out to user's buffer.
9566 		 */
9567 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9568 			/* First check that the dirent struct up to d_name is within the buffer */
9569 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
9570 			    /* Check that the length of the entire dirent is within the buffer */
9571 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9572 			    /* Check that the actual length including the name doesn't exceed d_reclen */
9573 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9574 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9575 				    vp->v_mount->mnt_vfsstat.f_mntonname,
9576 				    vp->v_name ? vp->v_name : "<unknown>");
9577 				error = EIO;
9578 				break;
9579 			}
9580 
9581 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
9582 
9583 			bzero(entry64, enbufsize);
9584 			/* Convert a dirent to a dirent64. */
9585 			entry64->d_ino = dep->d_ino;
9586 			entry64->d_seekoff = 0;
9587 			entry64->d_reclen = (uint16_t)enbufsize;
9588 			entry64->d_namlen = dep->d_namlen;
9589 			entry64->d_type = dep->d_type;
9590 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9591 
9592 			/* Move to next entry. */
9593 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
9594 
9595 			/* Copy entry64 to user's buffer. */
9596 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9597 		}
9598 
9599 		/* Update the real offset using the offset we got from VNOP_READDIR. */
9600 		if (error == 0) {
9601 			uio->uio_offset = auio->uio_offset;
9602 		}
9603 		uio_free(auio);
9604 		kfree_data(bufptr, bufsize);
9605 		kfree_type(struct direntry, entry64);
9606 		return error;
9607 	}
9608 }
9609 
/* Upper bound (128 MiB) clamped onto the caller-supplied buffer size in getdirentries_common(). */
#define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
9611 
9612 /*
9613  * Read a block of directory entries in a file system independent format.
9614  */
/*
 * getdirentries_common - shared implementation for getdirentries(2) and
 * getdirentries64(2): read a block of directory entries from the directory
 * open on fd into the user buffer bufp.
 *
 * Parameters:	fd		file descriptor open on a directory for read
 *		bufp		user buffer to receive entries
 *		bufsize		size of bufp (clamped to GETDIRENTRIES_MAXBUFSIZE)
 *		bytesread	out: number of bytes placed in bufp
 *		offset		out (optional): file offset before the read
 *		eofflag		out: non-zero at end of directory
 *		flags		VNODE_READDIR_EXTENDED selects direntry format
 *
 * Returns:	0 on success, or an errno (EBADF if fd not open for read,
 *		EINVAL if not a directory, or errors from the lookup/read).
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still
	 * refers to the vnode we looked up; if it was re-pointed in the
	 * meantime (e.g. by the union-mount traversal below racing on another
	 * thread — TODO confirm), drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the fd's current offset, via a stack-backed uio. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the vnode
	 * covered below, re-point the fd at it, and read from the start of
	 * the lower directory.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
9728 
9729 
9730 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)9731 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9732 {
9733 	off_t offset;
9734 	ssize_t bytesread;
9735 	int error, eofflag;
9736 
9737 	AUDIT_ARG(fd, uap->fd);
9738 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
9739 	    &bytesread, &offset, &eofflag, 0);
9740 
9741 	if (error == 0) {
9742 		if (proc_is64bit(p)) {
9743 			user64_long_t base = (user64_long_t)offset;
9744 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9745 		} else {
9746 			user32_long_t base = (user32_long_t)offset;
9747 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9748 		}
9749 		*retval = (int)bytesread;
9750 	}
9751 	return error;
9752 }
9753 
9754 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)9755 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9756 {
9757 	off_t offset;
9758 	ssize_t bytesread;
9759 	int error, eofflag;
9760 	user_size_t bufsize;
9761 
9762 	AUDIT_ARG(fd, uap->fd);
9763 
9764 	/*
9765 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9766 	 * then the kernel carves out the last 4 bytes to return extended
9767 	 * information to userspace (namely whether we reached EOF with this call).
9768 	 */
9769 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9770 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9771 	} else {
9772 		bufsize = uap->bufsize;
9773 	}
9774 
9775 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
9776 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9777 
9778 	if (error == 0) {
9779 		*retval = bytesread;
9780 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9781 
9782 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9783 			getdirentries64_flags_t flags = 0;
9784 			if (eofflag) {
9785 				flags |= GETDIRENTRIES64_EOF;
9786 			}
9787 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9788 			    sizeof(flags));
9789 		}
9790 	}
9791 	return error;
9792 }
9793 
9794 
9795 /*
9796  * Set the mode mask for creation of filesystem nodes.
9797  * XXX implement xsecurity
9798  */
9799 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
9800 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)9801 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9802 {
9803 	AUDIT_ARG(mask, newmask);
9804 	proc_fdlock(p);
9805 	*retval = p->p_fd.fd_cmask;
9806 	p->p_fd.fd_cmask = newmask & ALLPERMS;
9807 	proc_fdunlock(p);
9808 	return 0;
9809 }
9810 
9811 /*
9812  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9813  *
9814  * Parameters:    p                       Process requesting to set the umask
9815  *                uap                     User argument descriptor (see below)
9816  *                retval                  umask of the process (parameter p)
9817  *
9818  * Indirect:      uap->newmask            umask to set
9819  *                uap->xsecurity          ACL to set
9820  *
9821  * Returns:        0                      Success
9822  *                !0                      Not success
9823  *
9824  */
9825 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)9826 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9827 {
9828 	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
9829 }
9830 
9831 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)9832 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9833 {
9834 	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9835 }
9836 
9837 /*
9838  * Void all references to file by ripping underlying filesystem
9839  * away from vnode.
9840  */
9841 /* ARGSUSED */
9842 int
revoke(proc_t p,struct revoke_args * uap,__unused int32_t * retval)9843 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
9844 {
9845 	vnode_t vp;
9846 	struct vnode_attr va;
9847 	vfs_context_t ctx = vfs_context_current();
9848 	int error;
9849 	struct nameidata nd;
9850 
9851 	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
9852 	    uap->path, ctx);
9853 	error = namei(&nd);
9854 	if (error) {
9855 		return error;
9856 	}
9857 	vp = nd.ni_vp;
9858 
9859 	nameidone(&nd);
9860 
9861 	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
9862 		error = ENOTSUP;
9863 		goto out;
9864 	}
9865 
9866 	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
9867 		error = EBUSY;
9868 		goto out;
9869 	}
9870 
9871 #if CONFIG_MACF
9872 	error = mac_vnode_check_revoke(ctx, vp);
9873 	if (error) {
9874 		goto out;
9875 	}
9876 #endif
9877 
9878 	VATTR_INIT(&va);
9879 	VATTR_WANTED(&va, va_uid);
9880 	if ((error = vnode_getattr(vp, &va, ctx))) {
9881 		goto out;
9882 	}
9883 	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
9884 	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
9885 		goto out;
9886 	}
9887 	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
9888 		VNOP_REVOKE(vp, REVOKEALL, ctx);
9889 	}
9890 out:
9891 	vnode_put(vp);
9892 	return error;
9893 }
9894 
9895 
9896 /*
9897  *  HFS/HFS PlUS SPECIFIC SYSTEM CALLS
9898  *  The following system calls are designed to support features
9899  *  which are specific to the HFS & HFS Plus volume formats
9900  */
9901 
9902 
9903 /*
9904  * Obtain attribute information on objects in a directory while enumerating
9905  * the directory.
9906  */
9907 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	/*
	 * Enumerate a directory open on uap->fd, returning the requested
	 * attributes for each entry via VNOP_READDIRATTR.  On success,
	 * *retval is the eofflag (1 if the whole directory was read).
	 */
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's requested count so it can be restored when descending a union. */
	savecount = count;
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				/* Re-point the fd at the lower directory and restart the read there. */
				vnode_ref_ext(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0);
				fp_set_data(fp, vp);
				fp->fp_glob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Report the entry count, directory state, and starting offset back to userspace. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return error; /* return error earlier, an retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10056 
10057 /*
10058  * Exchange data between two files
10059  */
10060 
10061 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	/*
	 * Atomically exchange the data of two regular files on the same
	 * volume (uap->path1 and uap->path2) via VNOP_EXCHANGE, then swap
	 * their cached names/parents and emit an FSE_EXCHANGE fsevent.
	 */
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* FSOPT_NOFOLLOW suppresses following a trailing symlink on both paths. */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read+write on both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Gather paths and file info up front, but only if someone will
	 * consume them (an fsevent watcher or a fileop listener).
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The on-disk contents swapped, so swap the cached name/parent
		 * identities too, under the name cache lock.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10212 
10213 /*
10214  * Return (in MB) the amount of freespace on the given vnode's volume.
10215  */
10216 uint32_t freespace_mb(vnode_t vp);
10217 
10218 uint32_t
freespace_mb(vnode_t vp)10219 freespace_mb(vnode_t vp)
10220 {
10221 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10222 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10223 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10224 }
10225 
10226 #if CONFIG_SEARCHFS
10227 
10228 /* ARGSUSED */
10229 
10230 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10231 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10232 {
10233 	vnode_t vp, tvp;
10234 	int i, error = 0;
10235 	int fserror = 0;
10236 	struct nameidata nd;
10237 	struct user64_fssearchblock searchblock;
10238 	struct searchstate *state;
10239 	struct attrlist *returnattrs;
10240 	struct timeval timelimit;
10241 	void *searchparams1, *searchparams2;
10242 	uio_t auio = NULL;
10243 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10244 	uint32_t nummatches;
10245 	size_t mallocsize;
10246 	uint32_t nameiflags;
10247 	vfs_context_t ctx = vfs_context_current();
10248 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10249 
10250 	/* Start by copying in fsearchblock parameter list */
10251 	if (IS_64BIT_PROCESS(p)) {
10252 		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10253 		timelimit.tv_sec = searchblock.timelimit.tv_sec;
10254 		timelimit.tv_usec = searchblock.timelimit.tv_usec;
10255 	} else {
10256 		struct user32_fssearchblock tmp_searchblock;
10257 
10258 		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10259 		// munge into 64-bit version
10260 		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10261 		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10262 		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10263 		searchblock.maxmatches = tmp_searchblock.maxmatches;
10264 		/*
10265 		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10266 		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10267 		 */
10268 		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10269 		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10270 		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10271 		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10272 		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10273 		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10274 		searchblock.searchattrs = tmp_searchblock.searchattrs;
10275 	}
10276 	if (error) {
10277 		return error;
10278 	}
10279 
10280 	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10281 	 */
10282 	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10283 	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10284 		return EINVAL;
10285 	}
10286 
10287 	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10288 	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
10289 	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10290 	/* block.                                                                                             */
10291 	/*												      */
10292 	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
10293 	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
10294 	/*       assumes the size is still 556 bytes it will continue to work				      */
10295 
10296 	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10297 	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10298 
10299 	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10300 
10301 	/* Now set up the various pointers to the correct place in our newly allocated memory */
10302 
10303 	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10304 	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
10305 	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
10306 
10307 	/* Now copy in the stuff given our local variables. */
10308 
10309 	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
10310 		goto freeandexit;
10311 	}
10312 
10313 	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
10314 		goto freeandexit;
10315 	}
10316 
10317 	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
10318 		goto freeandexit;
10319 	}
10320 
10321 	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
10322 		goto freeandexit;
10323 	}
10324 
10325 	/*
10326 	 * When searching a union mount, need to set the
10327 	 * start flag at the first call on each layer to
10328 	 * reset state for the new volume.
10329 	 */
10330 	if (uap->options & SRCHFS_START) {
10331 		state->ss_union_layer = 0;
10332 	} else {
10333 		uap->options |= state->ss_union_flags;
10334 	}
10335 	state->ss_union_flags = 0;
10336 
10337 	/*
10338 	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10339 	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10340 	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10341 	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10342 	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10343 	 */
10344 
10345 	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10346 		attrreference_t* string_ref;
10347 		u_int32_t* start_length;
10348 		user64_size_t param_length;
10349 
10350 		/* validate searchparams1 */
10351 		param_length = searchblock.sizeofsearchparams1;
10352 		/* skip the word that specifies length of the buffer */
10353 		start_length = (u_int32_t*) searchparams1;
10354 		start_length = start_length + 1;
10355 		string_ref = (attrreference_t*) start_length;
10356 
10357 		/* ensure no negative offsets or too big offsets */
10358 		if (string_ref->attr_dataoffset < 0) {
10359 			error = EINVAL;
10360 			goto freeandexit;
10361 		}
10362 		if (string_ref->attr_length > MAXPATHLEN) {
10363 			error = EINVAL;
10364 			goto freeandexit;
10365 		}
10366 
10367 		/* Check for pointer overflow in the string ref */
10368 		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10369 			error = EINVAL;
10370 			goto freeandexit;
10371 		}
10372 
10373 		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10374 			error = EINVAL;
10375 			goto freeandexit;
10376 		}
10377 		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10378 			error = EINVAL;
10379 			goto freeandexit;
10380 		}
10381 	}
10382 
10383 	/* set up the uio structure which will contain the users return buffer */
10384 	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10385 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10386 
10387 	nameiflags = 0;
10388 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10389 		nameiflags |= FOLLOW;
10390 	}
10391 	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10392 	    UIO_USERSPACE, uap->path, ctx);
10393 
10394 	error = namei(&nd);
10395 	if (error) {
10396 		goto freeandexit;
10397 	}
10398 	vp = nd.ni_vp;
10399 	nameidone(&nd);
10400 
10401 	/*
10402 	 * Switch to the root vnode for the volume
10403 	 */
10404 	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10405 	vnode_put(vp);
10406 	if (error) {
10407 		goto freeandexit;
10408 	}
10409 	vp = tvp;
10410 
10411 #if CONFIG_UNION_MOUNTS
10412 	/*
10413 	 * If it's a union mount, the path lookup takes
10414 	 * us to the top layer. But we may need to descend
10415 	 * to a lower layer. For non-union mounts the layer
10416 	 * is always zero.
10417 	 */
10418 	for (i = 0; i < (int) state->ss_union_layer; i++) {
10419 		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10420 			break;
10421 		}
10422 		tvp = vp;
10423 		vp = vp->v_mount->mnt_vnodecovered;
10424 		if (vp == NULL) {
10425 			vnode_put(tvp);
10426 			error = ENOENT;
10427 			goto freeandexit;
10428 		}
10429 		error = vnode_getwithref(vp);
10430 		vnode_put(tvp);
10431 		if (error) {
10432 			goto freeandexit;
10433 		}
10434 	}
10435 #endif /* CONFIG_UNION_MOUNTS */
10436 
10437 #if CONFIG_MACF
10438 	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
10439 	if (error) {
10440 		vnode_put(vp);
10441 		goto freeandexit;
10442 	}
10443 #endif
10444 
10445 
10446 	/*
10447 	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
10449 	 */
10450 	if (searchblock.maxmatches == 0) {
10451 		nummatches = 0;
10452 		goto saveandexit;
10453 	}
10454 
10455 	/*
	 * All right, we have everything we need, so lets make that call.
10457 	 *
10458 	 * We keep special track of the return value from the file system:
10459 	 * EAGAIN is an acceptable error condition that shouldn't keep us
10460 	 * from copying out any results...
10461 	 */
10462 
10463 	fserror = VNOP_SEARCHFS(vp,
10464 	    searchparams1,
10465 	    searchparams2,
10466 	    &searchblock.searchattrs,
10467 	    (uint32_t)searchblock.maxmatches,
10468 	    &timelimit,
10469 	    returnattrs,
10470 	    &nummatches,
10471 	    (uint32_t)uap->scriptcode,
10472 	    (uint32_t)uap->options,
10473 	    auio,
10474 	    (struct searchstate *) &state->ss_fsstate,
10475 	    ctx);
10476 
10477 #if CONFIG_UNION_MOUNTS
10478 	/*
10479 	 * If it's a union mount we need to be called again
10480 	 * to search the mounted-on filesystem.
10481 	 */
10482 	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10483 		state->ss_union_flags = SRCHFS_START;
10484 		state->ss_union_layer++;        // search next layer down
10485 		fserror = EAGAIN;
10486 	}
10487 #endif /* CONFIG_UNION_MOUNTS */
10488 
10489 saveandexit:
10490 
10491 	vnode_put(vp);
10492 
10493 	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */
10495 
10496 	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10497 		goto freeandexit;
10498 	}
10499 
10500 	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10501 		goto freeandexit;
10502 	}
10503 
10504 	error = fserror;
10505 
10506 freeandexit:
10507 
10508 	kfree_data(searchparams1, mallocsize);
10509 
10510 	return error;
10511 } /* end of searchfs system call */
10512 
10513 #else /* CONFIG_SEARCHFS */
10514 
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs(2) is not supported in this configuration. */
	return ENOTSUP;
}
10520 
10521 #endif /* CONFIG_SEARCHFS */
10522 
10523 
10524 #if CONFIG_DATALESS_FILES
10525 
10526 /*
10527  * === Namespace Resolver Up-call Mechanism ===
10528  *
10529  * When I/O is performed to a dataless file or directory (read, write,
10530  * lookup-in, etc.), the file system performs an upcall to the namespace
10531  * resolver (filecoordinationd) to materialize the object.
10532  *
10533  * We need multiple up-calls to be in flight at once, and we need these
10534  * up-calls to be interruptible, thus the following implementation:
10535  *
10536  * => The nspace_resolver_request represents the in-kernel request state.
10537  *    It contains a request ID, storage space for the errno code returned
10538  *    by filecoordinationd, and flags.
10539  *
10540  * => The request ID is simply a global monotonically incrementing 32-bit
10541  *    number.  Outstanding requests are stored in a hash table, and the
10542  *    hash function is extremely simple.
10543  *
10544  * => When an upcall is to be made to filecoordinationd, a request structure
10545  *    is allocated on the stack (it is small, and needs to live only during
10546  *    the duration of the call to resolve_nspace_item_ext()).  It is
10547  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
10549  *    can be inserted into the table (and thus limiting the number of
10550  *    outstanding requests issued to filecoordinationd); waiting for an
10551  *    available slot is interruptible.
10552  *
10553  * => Once the request has been inserted into the table, the up-call is made
10554  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
10555  *    immediately and filecoordinationd processes the request asynchronously.
10556  *
 * => The caller now waits for the request to complete.  This is achieved by
10558  *    sleeping on the address of the request structure and waiting for
10559  *    filecoordinationd to mark the request structure as complete.  This
10560  *    is an interruptible sleep call; if interrupted, the request structure
10561  *    is removed from the table and EINTR is returned to the caller.  If
10562  *    this occurs, an advisory up-call is made to filecoordinationd with
10563  *    the request ID to indicate that the request can be aborted or
10564  *    de-prioritized at the discretion of filecoordinationd.
10565  *
10566  * => When filecoordinationd has completed the request, it signals completion
10567  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
10568  *    decorated as a namespace resolver can write to this sysctl node.  The
10569  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10570  *    The request ID is looked up in the table, and if the request is found,
10571  *    the error code is stored in the request structure and a wakeup()
10572  *    issued on the address of the request structure.  If the request is not
10573  *    found, we simply drop the completion notification, assuming that the
10574  *    caller was interrupted.
10575  *
10576  * => When the waiting thread wakes up, it extracts the error code from the
10577  *    request structure, removes the request from the table, and returns the
10578  *    error code to the calling function.  Fini!
10579  */
10580 
/*
 * In-kernel state for a single materialization request issued to the
 * namespace resolver (filecoordinationd).  Per the block comment above,
 * the structure lives on the requesting thread's stack and is linked
 * into the request hash table for the duration of the up-call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t         r_vp;           /* vnode being materialized; ref held by requester */
	uint32_t        r_req_id;       /* unique request ID (see next_nspace_req_id()) */
	int             r_resolver_error; /* errno reported back by the resolver */
	int             r_flags;        /* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001          /* resolver has completed this request */
10590 
/*
 * Return the next namespace-resolver request ID by atomically advancing
 * a global 32-bit counter.  IDs are used only as hash keys, so eventual
 * wrap-around is harmless.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
10598 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (capped at MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free table slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Mutex protecting all of the request-table state above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask comes from hashinit()). */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
10619 
10620 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)10621 nspace_resolver_req_lookup(uint32_t req_id)
10622 {
10623 	struct nspace_resolver_requesthead *bucket;
10624 	struct nspace_resolver_request *req;
10625 
10626 	bucket = NSPACE_RESOLVER_HASH(req_id);
10627 	LIST_FOREACH(req, bucket, r_hashlink) {
10628 		if (req->r_req_id == req_id) {
10629 			return req;
10630 		}
10631 	}
10632 
10633 	return NULL;
10634 }
10635 
10636 static int
nspace_resolver_req_add(struct nspace_resolver_request * req)10637 nspace_resolver_req_add(struct nspace_resolver_request *req)
10638 {
10639 	struct nspace_resolver_requesthead *bucket;
10640 	int error;
10641 
10642 	while (nspace_resolver_request_count >=
10643 	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
10644 		nspace_resolver_request_wait_slot = true;
10645 		error = msleep(&nspace_resolver_request_count,
10646 		    &nspace_resolver_request_hash_mutex,
10647 		    PVFS | PCATCH, "nspacerq", NULL);
10648 		if (error) {
10649 			return error;
10650 		}
10651 	}
10652 
10653 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10654 #if DIAGNOSTIC
10655 	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
10656 #endif /* DIAGNOSTIC */
10657 	LIST_INSERT_HEAD(bucket, req, r_hashlink);
10658 	nspace_resolver_request_count++;
10659 
10660 	return 0;
10661 }
10662 
10663 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)10664 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10665 {
10666 	struct nspace_resolver_requesthead *bucket;
10667 
10668 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10669 #if DIAGNOSTIC
10670 	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10671 #endif /* DIAGNOSTIC */
10672 	LIST_REMOVE(req, r_hashlink);
10673 	nspace_resolver_request_count--;
10674 
10675 	if (nspace_resolver_request_wait_slot) {
10676 		nspace_resolver_request_wait_slot = false;
10677 		wakeup(&nspace_resolver_request_count);
10678 	}
10679 }
10680 
10681 static void
nspace_resolver_req_cancel(uint32_t req_id)10682 nspace_resolver_req_cancel(uint32_t req_id)
10683 {
10684 	kern_return_t kr;
10685 	mach_port_t mp;
10686 
10687 	// Failures here aren't fatal -- the cancellation message
10688 	// sent to the resolver is merely advisory.
10689 
10690 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10691 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10692 		return;
10693 	}
10694 
10695 	kr = send_nspace_resolve_cancel(mp, req_id);
10696 	if (kr != KERN_SUCCESS) {
10697 		os_log_error(OS_LOG_DEFAULT,
10698 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10699 	}
10700 
10701 	ipc_port_release_send(mp);
10702 }
10703 
/*
 * Wait (interruptibly) for the resolver to complete @req.
 *
 * Sleeps on the address of the request structure until the resolver
 * marks it RRF_COMPLETE (via the vfs.nspace.complete sysctl).  If the
 * sleep is interrupted (other than ERESTART), the request is failed
 * locally with EINTR or ETIMEDOUT and an advisory cancellation is sent
 * to the resolver.  The request is always removed from the table before
 * returning.  Returns the resolver's errno (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record the failure ourselves. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* The cancel up-call does Mach IPC; send it after dropping the lock. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10733 
/*
 * Record the resolver's result in @req, mark it complete, and wake the
 * thread sleeping on it in nspace_resolver_req_wait().  Caller holds
 * the request hash mutex.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10743 
/*
 * Completion handler invoked from the vfs.nspace.complete sysctl when
 * the resolver reports that request @req_id finished with
 * @resolver_error.  If @orig_gencount is non-zero, the vnode's
 * recursive gencount is re-sampled and the requester gets EBUSY if it
 * changed during materialization.  An unknown @req_id is silently
 * dropped (the waiter was likely interrupted and already removed the
 * request).
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/* Hold off renames on the volume while we sample the gencount. */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				/* Attribute unavailable: treat as "no change detectable". */
				cur_gencount = 0;
			}

			/* NOTE(review): resolver_error and orig_gencount were already
			 * tested non-zero above; the re-checks here are redundant. */
			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
10811 
/* The process currently registered as the namespace resolver, or NULL. */
static struct proc *nspace_resolver_proc;

/*
 * Report (via @is_resolver) whether @p is the registered namespace
 * resolver.  Always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
10821 
10822 static int
nspace_resolver_set_proc_state(struct proc * p,int is_resolver)10823 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
10824 {
10825 	vfs_context_t ctx = vfs_context_current();
10826 	int error = 0;
10827 
10828 	//
10829 	// The system filecoordinationd runs as uid == 0.  This also
10830 	// has the nice side-effect of filtering out filecoordinationd
10831 	// running in the simulator.
10832 	//
10833 	if (!vfs_context_issuser(ctx)) {
10834 		return EPERM;
10835 	}
10836 
10837 	error = priv_check_cred(vfs_context_ucred(ctx),
10838 	    PRIV_VFS_DATALESS_RESOLVER, 0);
10839 	if (error) {
10840 		return error;
10841 	}
10842 
10843 	if (is_resolver) {
10844 		NSPACE_REQ_LOCK();
10845 
10846 		if (nspace_resolver_proc == NULL) {
10847 			proc_lock(p);
10848 			p->p_lflag |= P_LNSPACE_RESOLVER;
10849 			proc_unlock(p);
10850 			nspace_resolver_proc = p;
10851 		} else {
10852 			error = EBUSY;
10853 		}
10854 
10855 		NSPACE_REQ_UNLOCK();
10856 	} else {
10857 		// This is basically just like the exit case.
10858 		// nspace_resolver_exited() will verify that the
10859 		// process is the resolver, and will clear the
10860 		// global.
10861 		nspace_resolver_exited(p);
10862 	}
10863 
10864 	return error;
10865 }
10866 
10867 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)10868 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10869 {
10870 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10871 	    (p->p_vfs_iopolicy &
10872 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10873 		*is_prevented = 1;
10874 	} else {
10875 		*is_prevented = 0;
10876 	}
10877 	return 0;
10878 }
10879 
10880 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)10881 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10882 {
10883 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
10884 		return is_prevented ? 0 : EBUSY;
10885 	}
10886 
10887 	if (is_prevented) {
10888 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10889 	} else {
10890 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10891 	}
10892 	return 0;
10893 }
10894 
10895 static int
nspace_materialization_get_thread_state(int * is_prevented)10896 nspace_materialization_get_thread_state(int *is_prevented)
10897 {
10898 	uthread_t ut = current_uthread();
10899 
10900 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10901 	return 0;
10902 }
10903 
10904 static int
nspace_materialization_set_thread_state(int is_prevented)10905 nspace_materialization_set_thread_state(int is_prevented)
10906 {
10907 	uthread_t ut = current_uthread();
10908 
10909 	if (is_prevented) {
10910 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10911 	} else {
10912 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10913 	}
10914 	return 0;
10915 }
10916 
10917 /* the vfs.nspace branch */
10918 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10919 
10920 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)10921 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10922     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10923 {
10924 	struct proc *p = req->p;
10925 	int new_value, old_value, changed = 0;
10926 	int error;
10927 
10928 	error = nspace_resolver_get_proc_state(p, &old_value);
10929 	if (error) {
10930 		return error;
10931 	}
10932 
10933 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10934 	    &changed);
10935 	if (error == 0 && changed) {
10936 		error = nspace_resolver_set_proc_state(p, new_value);
10937 	}
10938 	return error;
10939 }
10940 
10941 /* decorate this process as the dataless file resolver */
10942 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10943     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10944     0, 0, sysctl_nspace_resolver, "I", "");
10945 
10946 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)10947 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10948     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10949 {
10950 	struct proc *p = req->p;
10951 	int new_value, old_value, changed = 0;
10952 	int error;
10953 
10954 	error = nspace_materialization_get_proc_state(p, &old_value);
10955 	if (error) {
10956 		return error;
10957 	}
10958 
10959 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10960 	    &changed);
10961 	if (error == 0 && changed) {
10962 		error = nspace_materialization_set_proc_state(p, new_value);
10963 	}
10964 	return error;
10965 }
10966 
10967 /* decorate this process as not wanting to materialize dataless files */
10968 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10969     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10970     0, 0, sysctl_nspace_prevent_materialization, "I", "");
10971 
10972 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)10973 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10974     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10975 {
10976 	int new_value, old_value, changed = 0;
10977 	int error;
10978 
10979 	error = nspace_materialization_get_thread_state(&old_value);
10980 	if (error) {
10981 		return error;
10982 	}
10983 
10984 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10985 	    &changed);
10986 	if (error == 0 && changed) {
10987 		error = nspace_materialization_set_thread_state(new_value);
10988 	}
10989 	return error;
10990 }
10991 
10992 /* decorate this thread as not wanting to materialize dataless files */
10993 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10994     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10995     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
10996 
/*
 * Handler for vfs.nspace.complete, the resolver's completion channel.
 * Only the registered resolver process may write here.
 *
 * The written payload is a uint32_t pair { req_id, errno }, optionally
 * followed by a uint64_t gencount.  The second sysctl_io_opaque() call
 * consumes the optional gencount from the same request buffer; its
 * failure is deliberately ignored because the gencount is optional.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11041 
11042 /* Resolver reports completed reqs here. */
11043 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11044     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11045     0, 0, sysctl_nspace_complete, "-", "");
11046 
11047 #endif /* CONFIG_DATALESS_FILES */
11048 
/*
 * __no_dataless_unused marks parameters that are referenced only when
 * CONFIG_DATALESS_FILES is enabled, so non-dataless builds don't warn
 * about unused parameters.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused    /* nothing */
#else
#define __no_dataless_unused    __unused
#endif
11054 
/*
 * Decide whether dataless-file materialization is prevented for @ctx.
 *
 * Returns:
 *   0           materialization may proceed;
 *   EDEADLK     materialization is prevented (kernel context, thread or
 *               process decoration, or the default policy);
 *   EJUSTRETURN the process holds the dataless-manipulation entitlement,
 *               so the operation should proceed as if the object were
 *               not dataless.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11111 
/*
 * One-time initialization of the namespace-resolver request hash table.
 * No-op when CONFIG_DATALESS_FILES is disabled.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11121 
11122 void
nspace_resolver_exited(struct proc * p __no_dataless_unused)11123 nspace_resolver_exited(struct proc *p __no_dataless_unused)
11124 {
11125 #if CONFIG_DATALESS_FILES
11126 	struct nspace_resolver_requesthead *bucket;
11127 	struct nspace_resolver_request *req;
11128 	u_long idx;
11129 
11130 	NSPACE_REQ_LOCK();
11131 
11132 	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11133 	    p == nspace_resolver_proc) {
11134 		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
11135 			bucket = &nspace_resolver_request_hashtbl[idx];
11136 			LIST_FOREACH(req, bucket, r_hashlink) {
11137 				nspace_resolver_req_mark_complete(req,
11138 				    ETIMEDOUT);
11139 			}
11140 		}
11141 		nspace_resolver_proc = NULL;
11142 	}
11143 
11144 	NSPACE_REQ_UNLOCK();
11145 #endif /* CONFIG_DATALESS_FILES */
11146 }
11147 
/*
 * Materialize a dataless item: convenience wrapper around
 * resolve_nspace_item_ext() with no lookup name.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11153 
/* Entitlements checked by vfs_context_is_dataless_manipulator(). */
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"
11158 
11159 /*
11160  * Return TRUE if the vfs context is associated with a process entitled
11161  * for dataless manipulation.
11162  *
11163  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11164  * complication around CONFIG_DATALESS_FILES.
11165  */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
{
#if CONFIG_DATALESS_FILES
	/* The entitlement checks apply to the current task, so the
	 * context must belong to the calling thread. */
	assert(ctx->vc_thread == current_thread());
	return IOCurrentTaskHasEntitlement( DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOCurrentTaskHasEntitlement(DATALESS_RESOLVER_ENTITLEMENT);
#else
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
11177 
11178 #if CONFIG_DATALESS_FILES
11179 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11180 log_materialization_prevented(vnode_t vp, uint64_t op)
11181 {
11182 	char p_name[MAXCOMLEN + 1];
11183 	char *vntype;
11184 	proc_selfname(&p_name[0], sizeof(p_name));
11185 
11186 	if (vp->v_type == VREG) {
11187 		vntype = "File";
11188 	} else if (vp->v_type == VDIR) {
11189 		vntype = "Dir";
11190 	} else if (vp->v_type == VLNK) {
11191 		vntype = "SymLink";
11192 	} else {
11193 		vntype = "Other";
11194 	}
11195 
11196 #if DEVELOPMENT
11197 	char *path = NULL;
11198 	int   len;
11199 
11200 	path = get_pathbuff();
11201 	len = MAXPATHLEN;
11202 	if (path) {
11203 		vn_getpath(vp, path, &len);
11204 	}
11205 
11206 	os_log_debug(OS_LOG_DEFAULT,
11207 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11208 	    p_name, proc_selfpid(),
11209 	    op, vntype, path ? path : "<unknown-path>");
11210 	if (path) {
11211 		release_pathbuff(path);
11212 	}
11213 #else
11214 	os_log_debug(OS_LOG_DEFAULT,
11215 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11216 	    p_name, proc_selfpid(),
11217 	    op, vntype);
11218 #endif
11219 }
11220 #endif /* CONFIG_DATALESS_FILES */
11221 
11222 
11223 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11224 vfs_materialize_item(
11225 	struct vnode *vp __no_dataless_unused,
11226 	uint64_t op __no_dataless_unused,
11227 	int64_t offset __no_dataless_unused,
11228 	int64_t size __no_dataless_unused,
11229 	char *lookup_name __no_dataless_unused,
11230 	size_t const namelen __no_dataless_unused)
11231 {
11232 #if CONFIG_DATALESS_FILES
11233 	struct nspace_resolver_request req;
11234 	kern_return_t kern_ret;
11235 	mach_port_t mach_port;
11236 	char *path = NULL;
11237 	vfs_context_t context;
11238 	int path_len;
11239 	int error;
11240 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11241 	audit_token_t atoken;
11242 #endif
11243 
11244 	/*
11245 	 * If this is a snapshot event and the vnode is on a disk image just
11246 	 * pretend nothing happened since any change to the disk image will
11247 	 * cause the disk image itself to get backed up and this avoids multi-
11248 	 * way deadlocks between the snapshot handler and the ever popular
11249 	 * diskimages-helper process. The variable nspace_allow_virtual_devs
11250 	 * allows this behavior to be overridden (for use by the Mobile
11251 	 * TimeMachine testing infrastructure which uses disk images).
11252 	 */
11253 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11254 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11255 		return ENOTSUP;
11256 	}
11257 
11258 	context = vfs_context_current();
11259 
11260 	error = vfs_context_dataless_materialization_is_prevented(context);
11261 	if (error) {
11262 		log_materialization_prevented(vp, op);
11263 		return error;
11264 	}
11265 
11266 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11267 	    &mach_port);
11268 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11269 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11270 		/*
11271 		 * Treat this like being unable to access the backing store
11272 		 * server.
11273 		 */
11274 		return ETIMEDOUT;
11275 	}
11276 
11277 	path = zalloc(ZV_NAMEI);
11278 	path_len = MAXPATHLEN;
11279 
11280 	error = vn_getpath(vp, path, &path_len);
11281 	if (error) {
11282 		goto out_release_port;
11283 	}
11284 
11285 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11286 	error = vfs_context_copy_audit_token(context, &atoken);
11287 	if (error) {
11288 		goto out_release_port;
11289 	}
11290 #endif
11291 
11292 	req.r_req_id = next_nspace_req_id();
11293 	req.r_resolver_error = 0;
11294 	req.r_flags = 0;
11295 	req.r_vp = vp;
11296 
11297 	NSPACE_REQ_LOCK();
11298 	error = nspace_resolver_req_add(&req);
11299 	NSPACE_REQ_UNLOCK();
11300 	if (error) {
11301 		goto out_release_port;
11302 	}
11303 
11304 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
11305 	if (vp->v_type == VDIR) {
11306 		char *tmpname = NULL;
11307 
11308 		/*
11309 		 * If the caller provided a lookup_name *and* a name length,
11310 		 * then we assume the lookup_name is not NUL-terminated.
11311 		 * Allocate a temporary buffer in this case to provide
11312 		 * a NUL-terminated path name to the IPC call.
11313 		 */
11314 		if (lookup_name != NULL && namelen != 0) {
11315 			if (namelen >= PATH_MAX) {
11316 				error = EINVAL;
11317 				goto out_release_port;
11318 			}
11319 			tmpname = zalloc(ZV_NAMEI);
11320 			strlcpy(tmpname, lookup_name, namelen + 1);
11321 			lookup_name = tmpname;
11322 		} else if (lookup_name != NULL) {
11323 			/*
11324 			 * If the caller provided a lookup_name with a
11325 			 * zero name length, then we assume it's NUL-
11326 			 * terminated.  Verify it has a valid length.
11327 			 */
11328 			if (strlen(lookup_name) >= PATH_MAX) {
11329 				error = EINVAL;
11330 				goto out_release_port;
11331 			}
11332 		}
11333 
11334 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11335 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
11336 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
11337 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
11338 #else
11339 		kern_ret = send_vfs_resolve_dir(mach_port, req.r_req_id,
11340 		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11341 		    lookup_name == NULL ? "" : lookup_name, path);
11342 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11343 
11344 		if (tmpname != NULL) {
11345 			zfree(ZV_NAMEI, tmpname);
11346 
11347 			/*
11348 			 * Poison lookup_name rather than reference
11349 			 * freed memory.
11350 			 */
11351 			lookup_name = NULL;
11352 		}
11353 	} else {
11354 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11355 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
11356 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
11357 		    offset, size, path, atoken);
11358 #else
11359 		kern_ret = send_vfs_resolve_file(mach_port, req.r_req_id,
11360 		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11361 		    offset, size, path);
11362 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11363 	}
11364 	if (kern_ret != KERN_SUCCESS) {
11365 		/*
11366 		 * Also treat this like being unable to access the backing
11367 		 * store server.
11368 		 */
11369 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
11370 		    kern_ret);
11371 		error = ETIMEDOUT;
11372 
11373 		NSPACE_REQ_LOCK();
11374 		nspace_resolver_req_remove(&req);
11375 		NSPACE_REQ_UNLOCK();
11376 		goto out_release_port;
11377 	}
11378 
11379 	/*
11380 	 * Give back the memory we allocated earlier while we wait; we
11381 	 * no longer need it.
11382 	 */
11383 	zfree(ZV_NAMEI, path);
11384 	path = NULL;
11385 
11386 	/*
11387 	 * Request has been submitted to the resolver. Now (interruptibly)
11388 	 * wait for completion. Upon requrn, the request will have been
11389 	 * removed from the lookup table.
11390 	 */
11391 	error = nspace_resolver_req_wait(&req);
11392 
11393 out_release_port:
11394 	if (path != NULL) {
11395 		zfree(ZV_NAMEI, path);
11396 	}
11397 	ipc_port_release_send(mach_port);
11398 
11399 	return error;
11400 #else
11401 	return ENOTSUP;
11402 #endif /* CONFIG_DATALESS_FILES */
11403 }
11404 
11405 /*
11406  * vfs_materialize_file: Materialize a regular file.
11407  *
11408  * Inputs:
11409  * vp		The dataless file to be materialized.
11410  *
11411  * op		What kind of operation is being performed:
11412  *		-> NAMESPACE_HANDLER_READ_OP
11413  *		-> NAMESPACE_HANDLER_WRITE_OP
11414  *		-> NAMESPACE_HANDLER_LINK_CREATE
11415  *		-> NAMESPACE_HANDLER_DELETE_OP
11416  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
11417  *		-> NAMESPACE_HANDLER_RENAME_OP
11418  *
11419  * offset	offset of I/O for READ or WRITE.  Ignored for
11420  *		other ops.
11421  *
11422  * size		size of I/O for READ or WRITE  Ignored for
11423  *		other ops.
11424  *
11425  * If offsize or size are -1 for a READ or WRITE, then the resolver should
11426  * consider the range to be unknown.
11427  *
11428  * Upon successful return, the caller may proceed with the operation.
11429  * N.B. the file may still be "dataless" in this case.
11430  */
11431 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)11432 vfs_materialize_file(
11433 	struct vnode *vp,
11434 	uint64_t op,
11435 	int64_t offset,
11436 	int64_t size)
11437 {
11438 	if (vp->v_type != VREG) {
11439 		return EFTYPE;
11440 	}
11441 	return vfs_materialize_item(vp, op, offset, size, NULL, 0);
11442 }
11443 
11444 /*
11445  * vfs_materialize_dir:
11446  *
11447  * Inputs:
11448  * vp		The dataless directory to be materialized.
11449  *
11450  * op		What kind of operation is being performed:
11451  *		-> NAMESPACE_HANDLER_READ_OP
11452  *		-> NAMESPACE_HANDLER_WRITE_OP
11453  *		-> NAMESPACE_HANDLER_DELETE_OP
11454  *		-> NAMESPACE_HANDLER_RENAME_OP
11455  *		-> NAMESPACE_HANDLER_LOOKUP_OP
11456  *
11457  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
11458  *		other ops.  May or may not be NUL-terminated; see below.
11459  *
11460  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
11461  *		terminated and namelen is the number of valid bytes in
11462  *		lookup_name. If zero, then lookup_name is assumed to be
11463  *		NUL-terminated.
11464  *
11465  * Upon successful return, the caller may proceed with the operation.
11466  * N.B. the directory may still be "dataless" in this case.
11467  */
11468 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)11469 vfs_materialize_dir(
11470 	struct vnode *vp,
11471 	uint64_t op,
11472 	char *lookup_name,
11473 	size_t namelen)
11474 {
11475 	if (vp->v_type != VDIR) {
11476 		return EFTYPE;
11477 	}
11478 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
11479 		return EINVAL;
11480 	}
11481 	return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
11482 }
11483 
/*
 * resolve_nspace_item_ext: Legacy namespace-event resolution path.
 * Builds the vnode's path, registers a request in the resolver lookup
 * table, sends it to filecoordinationd via send_nspace_resolve_path(),
 * and waits (interruptibly) for the resolver's reply.  Unlike
 * vfs_materialize_item(), this takes its own vnode_ref() on vp for the
 * duration of the request.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11601 
/*
 * nspace_snapshot_event: Namespace snapshot-event hook.  All arguments
 * are ignored and the event is reported as handled (always returns 0).
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
11608 
#if 0
/*
 * build_volfs_path: Construct a "/.vol/<fsid>/<fileid>" path for vp.
 * Compiled out (#if 0); kept for reference.  On vnode_getattr() failure
 * a placeholder path is emitted and -1 is returned; otherwise returns 0.
 * In both cases *len is set to the generated length plus the NUL byte.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
11631 
/*
 * fsctl_bogus_command_compat: Map legacy fsctl command values that were
 * issued with only the base command bits (IOCBASECMD strips the
 * direction/length bits) back to their full ioctl encodings, for binary
 * compatibility with older callers.  Unrecognized commands pass through
 * unchanged.
 */
static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)
{
	switch (cmd) {
	case IOCBASECMD(FSIOC_SYNC_VOLUME):
		return FSIOC_SYNC_VOLUME;
	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
		return FSIOC_ROUTEFS_SETROUTEID;
	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
		return FSIOC_SET_PACKAGE_EXTS;
	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
		return FSIOC_SET_FSTYPENAME_OVERRIDE;
	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
		return DISK_CONDITIONER_IOC_GET;
	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
		return DISK_CONDITIONER_IOC_SET;
	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
		return FSIOC_FIOSEEKHOLE;
	case IOCBASECMD(FSIOC_FIOSEEKDATA):
		return FSIOC_FIOSEEKDATA;
	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
		return SPOTLIGHT_IOC_GET_LAST_MTIME;
	}

	return cmd;
}
11660 
/*
 * cas_bsdflags_setattr: setattr callback for chflags0() that asks the
 * filesystem to compare-and-swap the BSD flags via FSIOC_CAS_BSDFLAGS.
 * "arg" is the struct fsioc_cas_bsdflags passed through from the caller.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11666 
/*
 * handle_sync_volume: Implement FSIOC_SYNC_VOLUME.  Syncs the mount
 * containing vp.  The iocount on vp is always dropped and *arg_vp is
 * set to NULL so the caller knows not to release it again.  "data"
 * points at the user-supplied FSCTL_SYNC_* flag word.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* wait flags in "arg" against
	 * the FSCTL_SYNC_FULLSYNC bit rather than testing the user flag
	 * word *(uint32_t *)data; presumably the latter was intended --
	 * confirm against fsctl.h flag values before changing.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
11729 
#if ROUTEFS
/*
 * handle_routes: Implement FSIOC_ROUTEFS_SETROUTEID.  Copies the route
 * path in from user space and mounts the route filesystem on it.
 * Requires super-user credentials.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char rpath[MAXPATHLEN];
	size_t copied = 0;
	int err;

	/* Only the super-user may set the route id. */
	err = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (err != 0) {
		return err;
	}

	bzero(rpath, MAXPATHLEN);
	err = copyinstr(udata, &rpath[0], MAXPATHLEN, &copied);
	if (err != 0) {
		return err;
	}

	return routefs_kernel_mount(rpath);
}
#endif
11750 
11751 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)11752 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
11753 {
11754 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11755 	struct vnode_attr va;
11756 	int error;
11757 
11758 	VATTR_INIT(&va);
11759 	VATTR_SET(&va, va_flags, cas->new_flags);
11760 
11761 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11762 	return error;
11763 }
11764 
11765 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)11766 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
11767 {
11768 	struct mount *mp = NULL;
11769 	errno_t rootauth = 0;
11770 
11771 	mp = vp->v_mount;
11772 
11773 	/*
11774 	 * query the underlying FS and see if it reports something
11775 	 * sane for this vnode. If volume is authenticated via
11776 	 * chunklist, leave that for the caller to determine.
11777 	 */
11778 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
11779 
11780 	return rootauth;
11781 }
11782 
11783 /*
11784  * Make a filesystem-specific control call:
11785  */
11786 /* ARGSUSED */
11787 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)11788 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
11789 {
11790 	int error = 0;
11791 	boolean_t is64bit;
11792 	u_int size;
11793 #define STK_PARAMS 128
11794 	char stkbuf[STK_PARAMS] = {0};
11795 	caddr_t data, memp;
11796 	vnode_t vp = *arg_vp;
11797 
11798 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
11799 		return ENOTTY;
11800 	}
11801 
11802 	cmd = fsctl_bogus_command_compat(cmd);
11803 
11804 	size = IOCPARM_LEN(cmd);
11805 	if (size > IOCPARM_MAX) {
11806 		return EINVAL;
11807 	}
11808 
11809 	is64bit = proc_is64bit(p);
11810 
11811 	memp = NULL;
11812 
11813 	if (size > sizeof(stkbuf)) {
11814 		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
11815 			return ENOMEM;
11816 		}
11817 		data = memp;
11818 	} else {
11819 		data = &stkbuf[0];
11820 	};
11821 
11822 	if (cmd & IOC_IN) {
11823 		if (size) {
11824 			error = copyin(udata, data, size);
11825 			if (error) {
11826 				if (memp) {
11827 					kfree_data(memp, size);
11828 				}
11829 				return error;
11830 			}
11831 		} else {
11832 			if (is64bit) {
11833 				*(user_addr_t *)data = udata;
11834 			} else {
11835 				*(uint32_t *)data = (uint32_t)udata;
11836 			}
11837 		};
11838 	} else if ((cmd & IOC_OUT) && size) {
11839 		/*
11840 		 * Zero the buffer so the user always
11841 		 * gets back something deterministic.
11842 		 */
11843 		bzero(data, size);
11844 	} else if (cmd & IOC_VOID) {
11845 		if (is64bit) {
11846 			*(user_addr_t *)data = udata;
11847 		} else {
11848 			*(uint32_t *)data = (uint32_t)udata;
11849 		}
11850 	}
11851 
11852 	/* Check to see if it's a generic command */
11853 	switch (cmd) {
11854 	case FSIOC_SYNC_VOLUME:
11855 		error = handle_sync_volume(vp, arg_vp, data, ctx);
11856 		break;
11857 
11858 	case FSIOC_ROUTEFS_SETROUTEID:
11859 #if ROUTEFS
11860 		error = handle_routes(udata);
11861 #endif
11862 		break;
11863 
11864 	case FSIOC_SET_PACKAGE_EXTS: {
11865 		user_addr_t ext_strings;
11866 		uint32_t    num_entries;
11867 		uint32_t    max_width;
11868 
11869 		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
11870 			break;
11871 		}
11872 
11873 		if ((is64bit && size != sizeof(user64_package_ext_info))
11874 		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
11875 			// either you're 64-bit and passed a 64-bit struct or
11876 			// you're 32-bit and passed a 32-bit struct.  otherwise
11877 			// it's not ok.
11878 			error = EINVAL;
11879 			break;
11880 		}
11881 
11882 		if (is64bit) {
11883 			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
11884 				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
11885 			}
11886 			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
11887 			num_entries = ((user64_package_ext_info *)data)->num_entries;
11888 			max_width   = ((user64_package_ext_info *)data)->max_width;
11889 		} else {
11890 			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
11891 			num_entries = ((user32_package_ext_info *)data)->num_entries;
11892 			max_width   = ((user32_package_ext_info *)data)->max_width;
11893 		}
11894 		error = set_package_extensions_table(ext_strings, num_entries, max_width);
11895 	}
11896 	break;
11897 
11898 	case FSIOC_SET_FSTYPENAME_OVERRIDE:
11899 	{
11900 		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11901 			break;
11902 		}
11903 		if (vp->v_mount) {
11904 			mount_lock(vp->v_mount);
11905 			if (data[0] != 0) {
11906 				int i;
11907 				for (i = 0; i < MFSTYPENAMELEN; i++) {
11908 					if (!data[i]) {
11909 						goto continue_copy;
11910 					}
11911 				}
11912 				/*
11913 				 * Getting here means we have a user data string which has no
11914 				 * NULL termination in its first MFSTYPENAMELEN bytes.
11915 				 * This is bogus, let's avoid strlcpy-ing the read data and
11916 				 * return an error.
11917 				 */
11918 				error = EINVAL;
11919 				goto unlock;
11920 continue_copy:
11921 				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
11922 				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
11923 				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11924 					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
11925 					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
11926 				}
11927 			} else {
11928 				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11929 					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
11930 				}
11931 				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
11932 				vp->v_mount->fstypename_override[0] = '\0';
11933 			}
11934 unlock:
11935 			mount_unlock(vp->v_mount);
11936 		}
11937 	}
11938 	break;
11939 
11940 	case DISK_CONDITIONER_IOC_GET: {
11941 		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
11942 	}
11943 	break;
11944 
11945 	case DISK_CONDITIONER_IOC_SET: {
11946 		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
11947 	}
11948 	break;
11949 
11950 	case FSIOC_CAS_BSDFLAGS:
11951 		error = handle_flags(vp, data, ctx);
11952 		break;
11953 
11954 	case FSIOC_FD_ONLY_OPEN_ONCE: {
11955 		error = 0;
11956 		if (vnode_usecount(vp) > 1) {
11957 			vnode_lock_spin(vp);
11958 			if (vp->v_lflag & VL_HASSTREAMS) {
11959 				if (vnode_isinuse_locked(vp, 1, 1)) {
11960 					error = EBUSY;
11961 				}
11962 			} else if (vnode_usecount(vp) > 1) {
11963 				error = EBUSY;
11964 			}
11965 			vnode_unlock(vp);
11966 		}
11967 	}
11968 	break;
11969 
11970 	case FSIOC_EVAL_ROOTAUTH:
11971 		error = handle_auth(vp, cmd, data, options, ctx);
11972 		break;
11973 
11974 	default: {
11975 		/* other, known commands shouldn't be passed down here */
11976 		switch (cmd) {
11977 		case F_PUNCHHOLE:
11978 		case F_TRIM_ACTIVE_FILE:
11979 		case F_RDADVISE:
11980 		case F_TRANSCODEKEY:
11981 		case F_GETPROTECTIONLEVEL:
11982 		case F_GETDEFAULTPROTLEVEL:
11983 		case F_MAKECOMPRESSED:
11984 		case F_SET_GREEDY_MODE:
11985 		case F_SETSTATICCONTENT:
11986 		case F_SETIOTYPE:
11987 		case F_SETBACKINGSTORE:
11988 		case F_GETPATH_MTMINFO:
11989 		case APFSIOC_REVERT_TO_SNAPSHOT:
11990 		case FSIOC_FIOSEEKHOLE:
11991 		case FSIOC_FIOSEEKDATA:
11992 		case HFS_GET_BOOT_INFO:
11993 		case HFS_SET_BOOT_INFO:
11994 		case FIOPINSWAP:
11995 		case F_CHKCLEAN:
11996 		case F_FULLFSYNC:
11997 		case F_BARRIERFSYNC:
11998 		case F_FREEZE_FS:
11999 		case F_THAW_FS:
12000 		case FSIOC_KERNEL_ROOTAUTH:
12001 			error = EINVAL;
12002 			goto outdrop;
12003 		}
12004 		/* Invoke the filesystem-specific code */
12005 		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12006 	}
12007 	} /* end switch stmt */
12008 
12009 	/*
12010 	 * if no errors, copy any data to user. Size was
12011 	 * already set and checked above.
12012 	 */
12013 	if (error == 0 && (cmd & IOC_OUT) && size) {
12014 		error = copyout(data, udata, size);
12015 	}
12016 
12017 outdrop:
12018 	if (memp) {
12019 		kfree_data(memp, size);
12020 	}
12021 
12022 	return error;
12023 }
12024 
/* ARGSUSED */
/*
 * fsctl: Path-based filesystem control system call.  Resolves uap->path
 * to a vnode, runs the MAC fsctl check, and hands off to
 * fsctl_internal().  The iocount taken by namei() is dropped here
 * unless fsctl_internal() already released it (it NULLs vp then).
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/* ARGSUSED */
/*
 * ffsctl: File-descriptor-based filesystem control system call.
 * Resolves uap->fd to its vnode, takes an iocount, runs the MAC fsctl
 * check, and hands off to fsctl_internal().
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on:  */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
12119 /* end of fsctl system call */
12120 
12121 #define FILESEC_ACCESS_ENTITLEMENT              \
12122 	"com.apple.private.vfs.filesec-access"
12123 
12124 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12125 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12126 {
12127 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12128 		/*
12129 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12130 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12131 		 */
12132 		if ((!setting && vfs_context_issuser(ctx)) ||
12133 		    IOCurrentTaskHasEntitlement(FILESEC_ACCESS_ENTITLEMENT)) {
12134 			return 0;
12135 		}
12136 	}
12137 
12138 	return EPERM;
12139 }
12140 
12141 /*
12142  *  Retrieve the data of an extended attribute.
12143  */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* These options are reserved for in-kernel use. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With a NULL uio, vn_getxattr() reports only the attribute size. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12226 
12227 /*
12228  * Retrieve the data of an extended attribute.
12229  */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* NOFOLLOW is meaningless on an fd; the others are kernel-only. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	if (uap->value && uap->size > 0) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	/* With a NULL uio, vn_getxattr() reports only the attribute size. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
12284 
/*
 * Set the data of an extended attribute.
 *
 * Path-based syscall: resolves uap->path (honoring XATTR_NOFOLLOW),
 * then writes uap->size bytes from uap->value as attribute 'attrname'.
 * Returns 0 on success or an errno; *retval is always set to 0.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* The security/default namespaces cannot be addressed here. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	/* Writing a protected attribute requires an entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	/* A non-zero size requires a source buffer ... */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	/* ... and must fit the int-sized plumbing below. */
	if (uap->size > INT_MAX) {
		return E2BIG;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	/* vp carries an iocount from namei(); released via vnode_put() below. */
	vp = nd.ni_vp;
	nameidone(&nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	/* Notify interested parties (e.g. fseventsd) of the change. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
12350 
12351 /*
12352  * Set the data of an extended attribute.
12353  */
12354 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)12355 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
12356 {
12357 	vnode_t vp;
12358 	char attrname[XATTR_MAXNAMELEN + 1];
12359 	vfs_context_t ctx = vfs_context_current();
12360 	uio_t auio = NULL;
12361 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12362 	size_t namelen;
12363 	int error;
12364 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12365 
12366 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12367 		return EINVAL;
12368 	}
12369 
12370 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12371 	if (error != 0) {
12372 		if (error == EPERM) {
12373 			/* if the string won't fit in attrname, copyinstr emits EPERM */
12374 			return ENAMETOOLONG;
12375 		}
12376 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12377 		return error;
12378 	}
12379 	if (xattr_protected(attrname) &&
12380 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
12381 		return error;
12382 	}
12383 	if (uap->size != 0 && uap->value == 0) {
12384 		return EINVAL;
12385 	}
12386 	if (uap->size > INT_MAX) {
12387 		return E2BIG;
12388 	}
12389 	if ((error = file_vnode(uap->fd, &vp))) {
12390 		return error;
12391 	}
12392 	if ((error = vnode_getwithref(vp))) {
12393 		file_drop(uap->fd);
12394 		return error;
12395 	}
12396 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12397 	    &uio_buf[0], sizeof(uio_buf));
12398 	uio_addiov(auio, uap->value, uap->size);
12399 
12400 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
12401 #if CONFIG_FSE
12402 	if (error == 0) {
12403 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
12404 		    FSE_ARG_VNODE, vp,
12405 		    FSE_ARG_DONE);
12406 	}
12407 #endif
12408 	vnode_put(vp);
12409 	file_drop(uap->fd);
12410 	*retval = 0;
12411 	return error;
12412 }
12413 
12414 /*
12415  * Remove an extended attribute.
12416  * XXX Code duplication here.
12417  */
12418 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)12419 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
12420 {
12421 	vnode_t vp;
12422 	struct nameidata nd;
12423 	char attrname[XATTR_MAXNAMELEN + 1];
12424 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12425 	vfs_context_t ctx = vfs_context_current();
12426 	size_t namelen;
12427 	u_int32_t nameiflags;
12428 	int error;
12429 
12430 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12431 		return EINVAL;
12432 	}
12433 
12434 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12435 	if (error != 0) {
12436 		return error;
12437 	}
12438 	if (xattr_protected(attrname)) {
12439 		return EPERM;
12440 	}
12441 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12442 	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
12443 	if ((error = namei(&nd))) {
12444 		return error;
12445 	}
12446 	vp = nd.ni_vp;
12447 	nameidone(&nd);
12448 
12449 	error = vn_removexattr(vp, attrname, uap->options, ctx);
12450 #if CONFIG_FSE
12451 	if (error == 0) {
12452 		add_fsevent(FSE_XATTR_REMOVED, ctx,
12453 		    FSE_ARG_VNODE, vp,
12454 		    FSE_ARG_DONE);
12455 	}
12456 #endif
12457 	vnode_put(vp);
12458 	*retval = 0;
12459 	return error;
12460 }
12461 
12462 /*
12463  * Remove an extended attribute.
12464  * XXX Code duplication here.
12465  */
12466 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)12467 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
12468 {
12469 	vnode_t vp;
12470 	char attrname[XATTR_MAXNAMELEN + 1];
12471 	size_t namelen;
12472 	int error;
12473 #if CONFIG_FSE
12474 	vfs_context_t ctx = vfs_context_current();
12475 #endif
12476 
12477 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12478 		return EINVAL;
12479 	}
12480 
12481 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12482 	if (error != 0) {
12483 		return error;
12484 	}
12485 	if (xattr_protected(attrname)) {
12486 		return EPERM;
12487 	}
12488 	if ((error = file_vnode(uap->fd, &vp))) {
12489 		return error;
12490 	}
12491 	if ((error = vnode_getwithref(vp))) {
12492 		file_drop(uap->fd);
12493 		return error;
12494 	}
12495 
12496 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
12497 #if CONFIG_FSE
12498 	if (error == 0) {
12499 		add_fsevent(FSE_XATTR_REMOVED, ctx,
12500 		    FSE_ARG_VNODE, vp,
12501 		    FSE_ARG_DONE);
12502 	}
12503 #endif
12504 	vnode_put(vp);
12505 	file_drop(uap->fd);
12506 	*retval = 0;
12507 	return error;
12508 }
12509 
12510 /*
12511  * Retrieve the list of extended attribute names.
12512  * XXX Code duplication here.
12513  */
12514 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)12515 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
12516 {
12517 	vnode_t vp;
12518 	struct nameidata nd;
12519 	vfs_context_t ctx = vfs_context_current();
12520 	uio_t auio = NULL;
12521 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12522 	size_t attrsize = 0;
12523 	u_int32_t nameiflags;
12524 	int error;
12525 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12526 
12527 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12528 		return EINVAL;
12529 	}
12530 
12531 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12532 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
12533 	if ((error = namei(&nd))) {
12534 		return error;
12535 	}
12536 	vp = nd.ni_vp;
12537 	nameidone(&nd);
12538 	if (uap->namebuf != 0 && uap->bufsize > 0) {
12539 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
12540 		    &uio_buf[0], sizeof(uio_buf));
12541 		uio_addiov(auio, uap->namebuf, uap->bufsize);
12542 	}
12543 
12544 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
12545 
12546 	vnode_put(vp);
12547 	if (auio) {
12548 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12549 	} else {
12550 		*retval = (user_ssize_t)attrsize;
12551 	}
12552 	return error;
12553 }
12554 
12555 /*
12556  * Retrieve the list of extended attribute names.
12557  * XXX Code duplication here.
12558  */
12559 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)12560 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
12561 {
12562 	vnode_t vp;
12563 	uio_t auio = NULL;
12564 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12565 	size_t attrsize = 0;
12566 	int error;
12567 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12568 
12569 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12570 		return EINVAL;
12571 	}
12572 
12573 	if ((error = file_vnode(uap->fd, &vp))) {
12574 		return error;
12575 	}
12576 	if ((error = vnode_getwithref(vp))) {
12577 		file_drop(uap->fd);
12578 		return error;
12579 	}
12580 	if (uap->namebuf != 0 && uap->bufsize > 0) {
12581 		auio = uio_createwithbuffer(1, 0, spacetype,
12582 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
12583 		uio_addiov(auio, uap->namebuf, uap->bufsize);
12584 	}
12585 
12586 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
12587 
12588 	vnode_put(vp);
12589 	file_drop(uap->fd);
12590 	if (auio) {
12591 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12592 	} else {
12593 		*retval = (user_ssize_t)attrsize;
12594 	}
12595 	return error;
12596 }
12597 
12598 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)12599 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
12600     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
12601 {
12602 	int error;
12603 	struct mount *mp = NULL;
12604 	vnode_t vp;
12605 	int length;
12606 	int bpflags;
12607 	/* maximum number of times to retry build_path */
12608 	unsigned int retries = 0x10;
12609 
12610 	if (bufsize > PAGE_SIZE) {
12611 		return EINVAL;
12612 	}
12613 
12614 	if (buf == NULL) {
12615 		return ENOMEM;
12616 	}
12617 
12618 retry:
12619 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
12620 		error = ENOTSUP;  /* unexpected failure */
12621 		return ENOTSUP;
12622 	}
12623 
12624 #if CONFIG_UNION_MOUNTS
12625 unionget:
12626 #endif /* CONFIG_UNION_MOUNTS */
12627 	if (objid == 2) {
12628 		struct vfs_attr vfsattr;
12629 		int use_vfs_root = TRUE;
12630 
12631 		VFSATTR_INIT(&vfsattr);
12632 		VFSATTR_WANTED(&vfsattr, f_capabilities);
12633 		if (!(options & FSOPT_ISREALFSID) &&
12634 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
12635 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
12636 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
12637 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
12638 				use_vfs_root = FALSE;
12639 			}
12640 		}
12641 
12642 		if (use_vfs_root) {
12643 			error = VFS_ROOT(mp, &vp, ctx);
12644 		} else {
12645 			error = VFS_VGET(mp, objid, &vp, ctx);
12646 		}
12647 	} else {
12648 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
12649 	}
12650 
12651 #if CONFIG_UNION_MOUNTS
12652 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
12653 		/*
12654 		 * If the fileid isn't found and we're in a union
12655 		 * mount volume, then see if the fileid is in the
12656 		 * mounted-on volume.
12657 		 */
12658 		struct mount *tmp = mp;
12659 		mp = vnode_mount(tmp->mnt_vnodecovered);
12660 		vfs_unbusy(tmp);
12661 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
12662 			goto unionget;
12663 		}
12664 	} else {
12665 		vfs_unbusy(mp);
12666 	}
12667 #else
12668 	vfs_unbusy(mp);
12669 #endif /* CONFIG_UNION_MOUNTS */
12670 
12671 	if (error) {
12672 		return error;
12673 	}
12674 
12675 #if CONFIG_MACF
12676 	error = mac_vnode_check_fsgetpath(ctx, vp);
12677 	if (error) {
12678 		vnode_put(vp);
12679 		return error;
12680 	}
12681 #endif
12682 
12683 	/* Obtain the absolute path to this vnode. */
12684 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
12685 	if (options & FSOPT_NOFIRMLINKPATH) {
12686 		bpflags |= BUILDPATH_NO_FIRMLINK;
12687 	}
12688 	bpflags |= BUILDPATH_CHECK_MOVED;
12689 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
12690 	vnode_put(vp);
12691 
12692 	if (error) {
12693 		/* there was a race building the path, try a few more times */
12694 		if (error == EAGAIN) {
12695 			--retries;
12696 			if (retries > 0) {
12697 				goto retry;
12698 			}
12699 
12700 			error = ENOENT;
12701 		}
12702 		goto out;
12703 	}
12704 
12705 	AUDIT_ARG(text, buf);
12706 
12707 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
12708 		unsigned long path_words[NUMPARMS];
12709 		size_t path_len = sizeof(path_words);
12710 
12711 		if ((size_t)length < path_len) {
12712 			memcpy((char *)path_words, buf, length);
12713 			memset((char *)path_words + length, 0, path_len - length);
12714 
12715 			path_len = length;
12716 		} else {
12717 			memcpy((char *)path_words, buf + (length - path_len), path_len);
12718 		}
12719 
12720 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
12721 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
12722 	}
12723 
12724 	*pathlen = length; /* may be superseded by error */
12725 
12726 out:
12727 	return error;
12728 }
12729 
12730 /*
12731  * Obtain the full pathname of a file system object by id.
12732  */
12733 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)12734 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
12735     uint32_t options, user_ssize_t *retval)
12736 {
12737 	vfs_context_t ctx = vfs_context_current();
12738 	fsid_t fsid;
12739 	char *realpath;
12740 	int length;
12741 	int error;
12742 
12743 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
12744 		return EINVAL;
12745 	}
12746 
12747 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
12748 		return error;
12749 	}
12750 	AUDIT_ARG(value32, fsid.val[0]);
12751 	AUDIT_ARG(value64, objid);
12752 	/* Restrict output buffer size for now. */
12753 
12754 	if (bufsize > PAGE_SIZE || bufsize <= 0) {
12755 		return EINVAL;
12756 	}
12757 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
12758 	if (realpath == NULL) {
12759 		return ENOMEM;
12760 	}
12761 
12762 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
12763 	    options, &length);
12764 
12765 	if (error) {
12766 		goto out;
12767 	}
12768 
12769 	error = copyout((caddr_t)realpath, buf, length);
12770 
12771 	*retval = (user_ssize_t)length; /* may be superseded by error */
12772 out:
12773 	kfree_data(realpath, bufsize);
12774 	return error;
12775 }
12776 
/*
 * fsgetpath() syscall: path-by-id lookup with no options.
 * See fsgetpath_ext() for the option-taking variant.
 */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
12783 
/*
 * fsgetpath_ext() syscall: like fsgetpath(), but forwards caller-supplied
 * options (FSOPT_NOFIRMLINKPATH / FSOPT_ISREALFSID).
 */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
12790 
/*
 * Common routine to handle various flavors of statfs data heading out
 *	to user space.
 *
 * Fills a user32_statfs or user64_statfs (chosen by is_64_bit) from the
 * mount and vfsstatfs, then copies it out to bufp.  When partial_copy is
 * set, the trailing reserved fields are omitted from the copyout.  If
 * sizep is non-NULL it receives the full (non-partial) structure size.
 *
 * Returns:	0			Success
 *		EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* honor a per-mount fstype-name override when present */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* omit the trailing reserved fields on a partial copy */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* honor a per-mount fstype-name override when present */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* omit the trailing reserved fields on a partial copy */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* report the full structure size regardless of partial_copy */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
12919 
12920 /*
12921  * copy stat structure into user_stat structure.
12922  */
12923 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)12924 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
12925 {
12926 	bzero(usbp, sizeof(*usbp));
12927 
12928 	usbp->st_dev = sbp->st_dev;
12929 	usbp->st_ino = sbp->st_ino;
12930 	usbp->st_mode = sbp->st_mode;
12931 	usbp->st_nlink = sbp->st_nlink;
12932 	usbp->st_uid = sbp->st_uid;
12933 	usbp->st_gid = sbp->st_gid;
12934 	usbp->st_rdev = sbp->st_rdev;
12935 #ifndef _POSIX_C_SOURCE
12936 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12937 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12938 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12939 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12940 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12941 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12942 #else
12943 	usbp->st_atime = sbp->st_atime;
12944 	usbp->st_atimensec = sbp->st_atimensec;
12945 	usbp->st_mtime = sbp->st_mtime;
12946 	usbp->st_mtimensec = sbp->st_mtimensec;
12947 	usbp->st_ctime = sbp->st_ctime;
12948 	usbp->st_ctimensec = sbp->st_ctimensec;
12949 #endif
12950 	usbp->st_size = sbp->st_size;
12951 	usbp->st_blocks = sbp->st_blocks;
12952 	usbp->st_blksize = sbp->st_blksize;
12953 	usbp->st_flags = sbp->st_flags;
12954 	usbp->st_gen = sbp->st_gen;
12955 	usbp->st_lspare = sbp->st_lspare;
12956 	usbp->st_qspare[0] = sbp->st_qspare[0];
12957 	usbp->st_qspare[1] = sbp->st_qspare[1];
12958 }
12959 
/*
 * Copy a kernel stat structure into a user32_stat for a 32-bit process.
 * Time and nanosecond fields are narrowed with explicit casts.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so padding bytes never leak to user space. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names depend on _POSIX_C_SOURCE. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12996 
/*
 * copy stat64 structure into user_stat64 structure.
 * (adds the birthtime fields relative to munge_user64_stat)
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero first so padding bytes never leak to user space. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names depend on _POSIX_C_SOURCE. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13040 
13041 void
munge_user32_stat64(struct stat64 * sbp,struct user32_stat64 * usbp)13042 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
13043 {
13044 	bzero(usbp, sizeof(*usbp));
13045 
13046 	usbp->st_dev = sbp->st_dev;
13047 	usbp->st_ino = sbp->st_ino;
13048 	usbp->st_mode = sbp->st_mode;
13049 	usbp->st_nlink = sbp->st_nlink;
13050 	usbp->st_uid = sbp->st_uid;
13051 	usbp->st_gid = sbp->st_gid;
13052 	usbp->st_rdev = sbp->st_rdev;
13053 #ifndef _POSIX_C_SOURCE
13054 	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
13055 	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
13056 	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
13057 	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
13058 	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
13059 	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
13060 	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
13061 	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
13062 #else
13063 	usbp->st_atime = sbp->st_atime;
13064 	usbp->st_atimensec = sbp->st_atimensec;
13065 	usbp->st_mtime = sbp->st_mtime;
13066 	usbp->st_mtimensec = sbp->st_mtimensec;
13067 	usbp->st_ctime = sbp->st_ctime;
13068 	usbp->st_ctimensec = sbp->st_ctimensec;
13069 	usbp->st_birthtime = sbp->st_birthtime;
13070 	usbp->st_birthtimensec = sbp->st_birthtimensec;
13071 #endif
13072 	usbp->st_size = sbp->st_size;
13073 	usbp->st_blocks = sbp->st_blocks;
13074 	usbp->st_blksize = sbp->st_blksize;
13075 	usbp->st_flags = sbp->st_flags;
13076 	usbp->st_gen = sbp->st_gen;
13077 	usbp->st_lspare = sbp->st_lspare;
13078 	usbp->st_qspare[0] = sbp->st_qspare[0];
13079 	usbp->st_qspare[1] = sbp->st_qspare[1];
13080 }
13081 
13082 /*
13083  * Purge buffer cache for simulating cold starts
13084  */
13085 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13086 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13087 {
13088 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13089 
13090 	return VNODE_RETURNED;
13091 }
13092 
13093 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13094 vfs_purge_callback(mount_t mp, __unused void * arg)
13095 {
13096 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13097 
13098 	return VFS_RETURNED;
13099 }
13100 
13101 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13102 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13103 {
13104 	if (!kauth_cred_issuser(kauth_cred_get())) {
13105 		return EPERM;
13106 	}
13107 
13108 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13109 
13110 	return 0;
13111 }
13112 
/*
 * gets the vnode associated with the (unnamed) snapshot directory
 * for a Filesystem. The snapshot directory vnode is returned with
 * an iocount on it.
 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate entirely to the filesystem via the mount of rvp. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
13123 
13124 /*
13125  * Get the snapshot vnode.
13126  *
13127  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13128  * needs nameidone() on ndp.
13129  *
13130  * If the snapshot vnode exists it is returned in ndp->ni_vp.
13131  *
13132  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13133  * not needed.
13134  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Ensure the out parameters are NULL on every error path. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* dirfd must refer to the root vnode of a mounted filesystem. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Snapshot directory vnode returned with an iocount on *sdvpp. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if the loop stops early, one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy checks for the mutating operations only. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On failure drop both iocounts and re-NULL the out parameters. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
13237 
13238 /*
13239  * create a filesystem snapshot (for supporting filesystems)
13240  *
13241  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13242  * We get to the (unnamed) snapshot directory vnode and create the vnode
13243  * for the snapshot in it.
13244  *
13245  * Restrictions:
13246  *
13247  *    a) Passed in name for snapshot cannot have slashes.
13248  *    b) name can't be "." or ".."
13249  *
13250  * Since this requires superuser privileges, vnode_authorize calls are not
13251  * made.
13252  */
13253 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)13254 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
13255     vfs_context_t ctx)
13256 {
13257 	vnode_t rvp, snapdvp;
13258 	int error;
13259 	struct nameidata *ndp;
13260 
13261 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
13262 
13263 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
13264 	    OP_LINK, ctx);
13265 	if (error) {
13266 		goto out;
13267 	}
13268 
13269 	if (ndp->ni_vp) {
13270 		vnode_put(ndp->ni_vp);
13271 		error = EEXIST;
13272 	} else {
13273 		struct vnode_attr *vap;
13274 		vnode_t vp = NULLVP;
13275 
13276 		vap = kalloc_type(struct vnode_attr, Z_WAITOK);
13277 
13278 		VATTR_INIT(vap);
13279 		VATTR_SET(vap, va_type, VREG);
13280 		VATTR_SET(vap, va_mode, 0);
13281 
13282 		error = vn_create(snapdvp, &vp, ndp, vap,
13283 		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
13284 		if (!error && vp) {
13285 			vnode_put(vp);
13286 		}
13287 
13288 		kfree_type(struct vnode_attr, vap);
13289 	}
13290 
13291 	nameidone(ndp);
13292 	vnode_put(snapdvp);
13293 	vnode_put(rvp);
13294 out:
13295 	kfree_type(struct nameidata, ndp);
13296 
13297 	return error;
13298 }
13299 
13300 /*
13301  * Delete a Filesystem snapshot
13302  *
13303  * get the vnode for the unnamed snapshot directory and the snapshot and
13304  * delete the snapshot.
13305  */
13306 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)13307 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
13308     vfs_context_t ctx)
13309 {
13310 	vnode_t rvp, snapdvp;
13311 	int error;
13312 	struct nameidata *ndp;
13313 
13314 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
13315 
13316 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
13317 	    OP_UNLINK, ctx);
13318 	if (error) {
13319 		goto out;
13320 	}
13321 
13322 	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
13323 	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
13324 
13325 	vnode_put(ndp->ni_vp);
13326 	nameidone(ndp);
13327 	vnode_put(snapdvp);
13328 	vnode_put(rvp);
13329 out:
13330 	kfree_type(struct nameidata, ndp);
13331 
13332 	return error;
13333 }
13334 
13335 /*
13336  * Revert a filesystem to a snapshot
13337  *
13338  * Marks the filesystem to revert to the given snapshot on next mount.
13339  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy in the snapshot name from userspace. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a componentname describing the snapshot for the filesystem. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */
		/*
		 * NOTE(review): fallback presumably covers filesystems that
		 * implement revert as a per-vnode ioctl (APFS) rather than
		 * the mount-level VFS_IOCTL — confirm against the FS plugins.
		 */
		vnode_t snapdvp;
		struct nameidata namend;

		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
13423 
13424 /*
13425  * rename a Filesystem snapshot
13426  *
13427  * get the vnode for the unnamed snapshot directory and the snapshot and
13428  * rename the snapshot. This is a very specialised (and simple) case of
13429  * rename(2) (which has to deal with a lot more complications). It differs
13430  * slightly from rename(2) in that EEXIST is returned if the new name exists.
13431  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot (DELETE op gets the MAC delete check
	 * and leaves the source prepared for removal from its old name).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we screen for
	 * it here as well.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping early means one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* The new name is effectively being created - check create policy. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both names live in the same snapshot directory (snapdvp). */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
13526 
13527 /*
13528  * Mount a Filesystem snapshot
13529  *
13530  * get the vnode for the unnamed snapshot directory and the snapshot and
13531  * mount the snapshot.
13532  */
13533 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)13534 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
13535     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
13536 {
13537 	mount_t mp;
13538 	vnode_t rvp, snapdvp, snapvp, vp, pvp;
13539 	struct fs_snapshot_mount_args smnt_data;
13540 	int error;
13541 	struct nameidata *snapndp, *dirndp;
13542 	/* carving out a chunk for structs that are too big to be on stack. */
13543 	struct {
13544 		struct nameidata snapnd;
13545 		struct nameidata dirnd;
13546 	} * __snapshot_mount_data;
13547 
13548 	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
13549 	snapndp = &__snapshot_mount_data->snapnd;
13550 	dirndp = &__snapshot_mount_data->dirnd;
13551 
13552 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
13553 	    OP_LOOKUP, ctx);
13554 	if (error) {
13555 		goto out;
13556 	}
13557 
13558 	snapvp  = snapndp->ni_vp;
13559 	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
13560 		error = EIO;
13561 		goto out1;
13562 	}
13563 
13564 	/* Get the vnode to be covered */
13565 	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
13566 	    UIO_USERSPACE, directory, ctx);
13567 	error = namei(dirndp);
13568 	if (error) {
13569 		goto out1;
13570 	}
13571 
13572 	vp = dirndp->ni_vp;
13573 	pvp = dirndp->ni_dvp;
13574 	mp = vnode_mount(rvp);
13575 
13576 	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
13577 		error = EINVAL;
13578 		goto out2;
13579 	}
13580 
13581 #if CONFIG_MACF
13582 	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
13583 	    mp->mnt_vfsstat.f_fstypename);
13584 	if (error) {
13585 		goto out2;
13586 	}
13587 #endif
13588 
13589 	smnt_data.sm_mp  = mp;
13590 	smnt_data.sm_cnp = &snapndp->ni_cnd;
13591 	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
13592 	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
13593 	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
13594 
13595 out2:
13596 	vnode_put(vp);
13597 	vnode_put(pvp);
13598 	nameidone(dirndp);
13599 out1:
13600 	vnode_put(snapvp);
13601 	vnode_put(snapdvp);
13602 	vnode_put(rvp);
13603 	nameidone(snapndp);
13604 out:
13605 	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
13606 	return error;
13607 }
13608 
13609 /*
13610  * Root from a snapshot of the filesystem
13611  *
13612  * Marks the filesystem to root from the given snapshot on next boot.
13613  */
13614 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)13615 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
13616     vfs_context_t ctx)
13617 {
13618 	int error;
13619 	vnode_t rvp;
13620 	mount_t mp;
13621 	struct fs_snapshot_root_args root_data;
13622 	struct componentname cnp;
13623 	caddr_t name_buf;
13624 	size_t name_len;
13625 
13626 	error = vnode_getfromfd(ctx, dirfd, &rvp);
13627 	if (error) {
13628 		return error;
13629 	}
13630 	mp = vnode_mount(rvp);
13631 
13632 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
13633 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
13634 	if (error) {
13635 		zfree(ZV_NAMEI, name_buf);
13636 		vnode_put(rvp);
13637 		return error;
13638 	}
13639 
13640 	// XXX MAC checks ?
13641 
13642 	/*
13643 	 * Grab mount_iterref so that we can release the vnode,
13644 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
13645 	 */
13646 	error = mount_iterref(mp, 0);
13647 	vnode_put(rvp);
13648 	if (error) {
13649 		zfree(ZV_NAMEI, name_buf);
13650 		return error;
13651 	}
13652 
13653 	memset(&cnp, 0, sizeof(cnp));
13654 	cnp.cn_pnbuf = (char *)name_buf;
13655 	cnp.cn_nameiop = LOOKUP;
13656 	cnp.cn_flags = ISLASTCN | HASBUF;
13657 	cnp.cn_pnlen = MAXPATHLEN;
13658 	cnp.cn_nameptr = cnp.cn_pnbuf;
13659 	cnp.cn_namelen = (int)name_len;
13660 	root_data.sr_cnp = &cnp;
13661 
13662 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
13663 
13664 	mount_iterdrop(mp);
13665 	zfree(ZV_NAMEI, name_buf);
13666 
13667 	return error;
13668 }
13669 
13670 /*
13671  * FS snapshot operations dispatcher
13672  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require the PRIV_VFS_SNAPSHOT privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allow the op if the caller is superuser, can write the
		 * backing device, or holds the snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOCurrentTaskHasEntitlement("com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
13762