xref: /xnu-10002.81.5/bsd/vfs/vfs_subr.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  *
3  * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
4  *
5  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6  *
7  * This file contains Original Code and/or Modifications of Original Code
8  * as defined in and that are subject to the Apple Public Source License
9  * Version 2.0 (the 'License'). You may not use this file except in
10  * compliance with the License. The rights granted to you under the License
11  * may not be used to create, or enable the creation or redistribution of,
12  * unlawful or unlicensed copies of an Apple operating system, or to
13  * circumvent, violate, or enable the circumvention or violation of, any
14  * terms of an Apple operating system software license agreement.
15  *
16  * Please obtain a copy of the License at
17  * http://www.opensource.apple.com/apsl/ and read it before using this file.
18  *
19  * The Original Code and all software distributed under the License are
20  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
21  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
22  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
24  * Please see the License for the specific language governing rights and
25  * limitations under the License.
26  *
27  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28  */
29 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
30 /*
31  * Copyright (c) 1989, 1993
32  *	The Regents of the University of California.  All rights reserved.
33  * (c) UNIX System Laboratories, Inc.
34  * All or some portions of this file are derived from material licensed
35  * to the University of California by American Telephone and Telegraph
36  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37  * the permission of UNIX System Laboratories, Inc.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. All advertising materials mentioning features or use of this software
48  *    must display the following acknowledgement:
49  *	This product includes software developed by the University of
50  *	California, Berkeley and its contributors.
51  * 4. Neither the name of the University nor the names of its contributors
52  *    may be used to endorse or promote products derived from this software
53  *    without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  *
67  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
68  */
69 /*
70  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
71  * support for mandatory and extensible security protections.  This notice
72  * is included in support of clause 2.2 (b) of the Apple Public License,
73  * Version 2.0.
74  */
75 
76 /*
77  * External virtual filesystem routines
78  */
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/proc_internal.h>
83 #include <sys/kauth.h>
84 #include <sys/mount_internal.h>
85 #include <sys/time.h>
86 #include <sys/lock.h>
87 #include <sys/vnode.h>
88 #include <sys/vnode_internal.h>
89 #include <sys/stat.h>
90 #include <sys/namei.h>
91 #include <sys/ucred.h>
92 #include <sys/buf_internal.h>
93 #include <sys/errno.h>
94 #include <kern/kalloc.h>
95 #include <sys/uio_internal.h>
96 #include <sys/uio.h>
97 #include <sys/domain.h>
98 #include <sys/mbuf.h>
99 #include <sys/syslog.h>
100 #include <sys/ubc_internal.h>
101 #include <sys/vm.h>
102 #include <sys/sysctl.h>
103 #include <sys/filedesc.h>
104 #include <sys/event.h>
105 #include <sys/kdebug.h>
106 #include <sys/kauth.h>
107 #include <sys/user.h>
108 #include <sys/systm.h>
109 #include <sys/kern_memorystatus.h>
110 #include <sys/lockf.h>
111 #include <sys/reboot.h>
112 #include <miscfs/fifofs/fifo.h>
113 
114 #include <nfs/nfs.h>
115 
116 #include <string.h>
117 #include <machine/machine_routines.h>
118 
119 #include <kern/assert.h>
120 #include <mach/kern_return.h>
121 #include <kern/thread.h>
122 #include <kern/sched_prim.h>
123 #include <kern/smr.h>
124 
125 #include <miscfs/specfs/specdev.h>
126 
127 #include <mach/mach_types.h>
128 #include <mach/memory_object_types.h>
129 #include <mach/memory_object_control.h>
130 
131 #include <kern/kalloc.h>        /* kalloc()/kfree() */
132 #include <kern/clock.h>         /* delay_for_interval() */
133 #include <libkern/coreanalytics/coreanalytics.h>
134 #include <libkern/OSAtomic.h>   /* OSAddAtomic() */
135 #include <os/atomic_private.h>
136 #if defined(XNU_TARGET_OS_OSX)
137 #include <console/video_console.h>
138 #endif
139 
140 #ifdef CONFIG_IOCOUNT_TRACE
141 #include <libkern/OSDebug.h>
142 #endif
143 
144 #include <vm/vm_protos.h>       /* vnode_pager_vrele() */
145 
146 #if CONFIG_MACF
147 #include <security/mac_framework.h>
148 #endif
149 
150 #include <vfs/vfs_disk_conditioner.h>
151 #include <libkern/section_keywords.h>
152 
153 static LCK_GRP_DECLARE(vnode_lck_grp, "vnode");
154 static LCK_ATTR_DECLARE(vnode_lck_attr, 0, 0);
155 
156 #if CONFIG_TRIGGERS
157 static LCK_GRP_DECLARE(trigger_vnode_lck_grp, "trigger_vnode");
158 static LCK_ATTR_DECLARE(trigger_vnode_lck_attr, 0, 0);
159 #endif
160 
161 extern lck_mtx_t mnt_list_mtx_lock;
162 
163 static KALLOC_TYPE_DEFINE(specinfo_zone, struct specinfo, KT_DEFAULT);
164 
165 ZONE_DEFINE(vnode_zone, "vnodes",
166     sizeof(struct vnode), ZC_NOGC | ZC_ZFREE_CLEARMEM);
167 
168 enum vtype iftovt_tab[16] = {
169 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
170 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
171 };
172 int     vttoif_tab[9] = {
173 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
174 	S_IFSOCK, S_IFIFO, S_IFMT,
175 };
176 
177 /* XXX These should be in a BSD accessible Mach header, but aren't. */
178 extern void             memory_object_mark_used(
179 	memory_object_control_t         control);
180 
181 extern void             memory_object_mark_unused(
182 	memory_object_control_t         control,
183 	boolean_t                       rage);
184 
185 extern void             memory_object_mark_io_tracking(
186 	memory_object_control_t         control);
187 
188 extern int paniclog_append_noflush(const char *format, ...);
189 
/* XXX next prototype should be from <libsa/stdlib.h> but conflicts libkern */
191 __private_extern__ void qsort(
192 	void * array,
193 	size_t nmembers,
194 	size_t member_size,
195 	int (*)(const void *, const void *));
196 
197 __private_extern__ void vntblinit(void);
198 __private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
199     enum uio_seg, int);
200 
201 static void vnode_list_add(vnode_t);
202 static void vnode_async_list_add(vnode_t);
203 static void vnode_list_remove(vnode_t);
204 static void vnode_list_remove_locked(vnode_t);
205 
206 static void vnode_abort_advlocks(vnode_t);
207 static errno_t vnode_drain(vnode_t);
208 static void vgone(vnode_t, int flags);
209 static void vclean(vnode_t vp, int flag);
210 static void vnode_reclaim_internal(vnode_t, int, int, int);
211 
212 static void vnode_dropiocount(vnode_t);
213 
214 static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
215 static int  vnode_reload(vnode_t);
216 
217 static int unmount_callback(mount_t, __unused void *);
218 
219 static void insmntque(vnode_t vp, mount_t mp);
220 static int mount_getvfscnt(void);
221 static int mount_fillfsids(fsid_t *, int );
222 static void vnode_iterate_setup(mount_t);
223 int vnode_umount_preflight(mount_t, vnode_t, int);
224 static int vnode_iterate_prepare(mount_t);
225 static int vnode_iterate_reloadq(mount_t);
226 static void vnode_iterate_clear(mount_t);
227 static mount_t vfs_getvfs_locked(fsid_t *);
228 static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
229     struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
230 static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);
231 
232 errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
233 
234 #ifdef CONFIG_IOCOUNT_TRACE
235 static void record_vp(vnode_t vp, int count);
236 static TUNABLE(int, bootarg_vnode_iocount_trace, "vnode_iocount_trace", 0);
237 static TUNABLE(int, bootarg_uthread_iocount_trace, "uthread_iocount_trace", 0);
238 #endif /* CONFIG_IOCOUNT_TRACE */
239 
240 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
241 static TUNABLE(bool, bootarg_no_vnode_jetsam, "-no_vnode_jetsam", false);
242 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
243 
244 static TUNABLE(bool, bootarg_no_vnode_drain, "-no_vnode_drain", false);
245 
246 __options_decl(freeable_vnode_level_t, uint32_t, {
247 	DEALLOC_VNODE_NONE = 0,
248 	DEALLOC_VNODE_ONLY_OVERFLOW = 1,
249 	DEALLOC_VNODE_ALL = 2
250 });
251 
252 #if XNU_TARGET_OS_OSX
253 static TUNABLE(freeable_vnode_level_t, bootarg_vn_dealloc_level, "vn_dealloc_level", DEALLOC_VNODE_NONE);
254 #else
255 static TUNABLE(freeable_vnode_level_t, bootarg_vn_dealloc_level, "vn_dealloc_level", DEALLOC_VNODE_ONLY_OVERFLOW);
#endif /* XNU_TARGET_OS_OSX */
257 
258 static freeable_vnode_level_t vn_dealloc_level = DEALLOC_VNODE_NONE;
259 
260 boolean_t root_is_CF_drive = FALSE;
261 
262 #if CONFIG_TRIGGERS
263 static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
264 static void vnode_resolver_detach(vnode_t);
265 #endif
266 
267 TAILQ_HEAD(freelst, vnode) vnode_free_list;     /* vnode free list */
268 TAILQ_HEAD(deadlst, vnode) vnode_dead_list;     /* vnode dead list */
269 TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list;
270 
271 
272 TAILQ_HEAD(ragelst, vnode) vnode_rage_list;     /* vnode rapid age list */
273 struct timeval rage_tv;
274 int     rage_limit = 0;
275 int     ragevnodes = 0;
276 
277 long  reusablevnodes_max = LONG_MAX;
278 long  reusablevnodes = 0;
279 int   deadvnodes_low = 0;
280 int   deadvnodes_high = 0;
281 int   numvnodes_min = 0;
282 int   numvnodes_max = 0;
283 
284 uint64_t newvnode = 0;
285 unsigned long newvnode_nodead = 0;
286 
287 static  int vfs_unmountall_started = 0;
288 static  int vfs_unmountall_finished = 0;
289 static  uint64_t vfs_shutdown_last_completion_time;
290 
291 #define RAGE_LIMIT_MIN  100
292 #define RAGE_TIME_LIMIT 5
293 
294 VFS_SMR_DECLARE;
295 extern uint32_t nc_smr_enabled;
296 
297 /*
298  * ROSV definitions
299  * NOTE: These are shadowed from PlatformSupport definitions, but XNU
300  * builds standalone.
301  */
302 #define PLATFORM_DATA_VOLUME_MOUNT_POINT "/System/Volumes/Data"
303 
304 /*
305  * These could be in PlatformSupport but aren't yet
306  */
307 #define PLATFORM_PREBOOT_VOLUME_MOUNT_POINT "/System/Volumes/Preboot"
308 #define PLATFORM_RECOVERY_VOLUME_MOUNT_POINT "/System/Volumes/Recovery"
309 
310 #if CONFIG_MOUNT_VM
311 #define PLATFORM_VM_VOLUME_MOUNT_POINT "/System/Volumes/VM"
312 #endif
313 
314 struct mntlist mountlist;                       /* mounted filesystem list */
315 static int nummounts = 0;
316 
317 static int print_busy_vnodes = 0;                               /* print out busy vnodes */
318 
319 #if DIAGNOSTIC
320 #define VLISTCHECK(fun, vp, list)       \
321 	if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
322 	        panic("%s: %s vnode not on %slist", (fun), (list), (list));
323 #else
324 #define VLISTCHECK(fun, vp, list)
325 #endif /* DIAGNOSTIC */
326 
327 #define VLISTNONE(vp)   \
328 	do {    \
329 	        (vp)->v_freelist.tqe_next = (struct vnode *)0;  \
330 	        (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;   \
331 	} while(0)
332 
333 #define VONLIST(vp)     \
334 	((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
335 
336 /* remove a vnode from free vnode list */
337 #define VREMFREE(fun, vp)       \
338 	do {    \
339 	        VLISTCHECK((fun), (vp), "free");        \
340 	        TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);       \
341 	        VLISTNONE((vp));        \
342 	        freevnodes--;   \
343 	        reusablevnodes--;    \
344 	} while(0)
345 
346 
347 /* remove a vnode from dead vnode list */
348 #define VREMDEAD(fun, vp)       \
349 	do {    \
350 	        VLISTCHECK((fun), (vp), "dead");        \
351 	        TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);       \
352 	        VLISTNONE((vp));        \
353 	        vp->v_listflag &= ~VLIST_DEAD;  \
354 	        deadvnodes--;   \
355 	        if (vp->v_listflag & VLIST_NO_REUSE) {        \
356 	                deadvnodes_noreuse--;        \
357 	        }        \
358 	} while(0)
359 
360 
361 /* remove a vnode from async work vnode list */
362 #define VREMASYNC_WORK(fun, vp) \
363 	do {    \
364 	        VLISTCHECK((fun), (vp), "async_work");  \
365 	        TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \
366 	        VLISTNONE((vp));        \
367 	        vp->v_listflag &= ~VLIST_ASYNC_WORK;    \
368 	        async_work_vnodes--;    \
369 	        if (!(vp->v_listflag & VLIST_NO_REUSE)) {        \
370 	                reusablevnodes--;    \
371 	        }        \
372 	} while(0)
373 
374 
375 /* remove a vnode from rage vnode list */
376 #define VREMRAGE(fun, vp)       \
377 	do {    \
378 	        if ( !(vp->v_listflag & VLIST_RAGE))                    \
379 	                panic("VREMRAGE: vp not on rage list");         \
380 	        VLISTCHECK((fun), (vp), "rage");                        \
381 	        TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);       \
382 	        VLISTNONE((vp));                \
383 	        vp->v_listflag &= ~VLIST_RAGE;  \
384 	        ragevnodes--;                   \
385 	        reusablevnodes--;    \
386 	} while(0)
387 
388 static void async_work_continue(void);
389 static void vn_laundry_continue(void);
390 static void wakeup_laundry_thread(void);
391 static void vnode_smr_free(void *, size_t);
392 
393 CA_EVENT(freeable_vnodes,
394     CA_INT, numvnodes_min,
395     CA_INT, numvnodes_max,
396     CA_INT, desiredvnodes,
397     CA_INT, numvnodes,
398     CA_INT, freevnodes,
399     CA_INT, deadvnodes,
400     CA_INT, freeablevnodes,
401     CA_INT, busyvnodes,
402     CA_BOOL, threshold_crossed);
403 static CA_EVENT_TYPE(freeable_vnodes) freeable_vnodes_telemetry;
404 
405 static bool freeablevnodes_threshold_crossed = false;
406 
/*
 * Initialize the vnode management data structures.
 *
 * Sets up the free/rage/dead/async-work vnode lists and the mount list,
 * derives the rage and dead-vnode thresholds from `desiredvnodes`,
 * applies the vn_dealloc_level boot-arg policy, seeds the freeable-vnode
 * telemetry snapshot, and spawns the async-work and laundry worker
 * threads.  Called once at boot, before any vnodes are created.
 */
__private_extern__ void
vntblinit(void)
{
	thread_t        thread = THREAD_NULL;
	int desiredvnodes_one_percent = desiredvnodes / 100;

	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_rage_list);
	TAILQ_INIT(&vnode_dead_list);
	TAILQ_INIT(&vnode_async_work_list);
	TAILQ_INIT(&mountlist);

	/* rapid-aging limit: 1% of desiredvnodes, but at least RAGE_LIMIT_MIN */
	microuptime(&rage_tv);
	rage_limit = desiredvnodes_one_percent;
	if (rage_limit < RAGE_LIMIT_MIN) {
		rage_limit = RAGE_LIMIT_MIN;
	}

	/* dead-vnode low watermark: 1% of desiredvnodes, capped at 300 */
	deadvnodes_low = desiredvnodes_one_percent;
	if (deadvnodes_low > 300) {
		deadvnodes_low = 300;
	}
	deadvnodes_high = deadvnodes_low * 2;

	/*
	 * Vnode deallocation policy (vn_dealloc_level boot-arg):
	 * ONLY_OVERFLOW lets numvnodes grow to 2x desiredvnodes before
	 * vnodes become freeable; ALL additionally lowers the floor to 40%
	 * of desiredvnodes and bounds the reusable pool at 20% (minus the
	 * dead-list low mark).
	 */
	numvnodes_min = numvnodes_max = desiredvnodes;
	if (bootarg_vn_dealloc_level == DEALLOC_VNODE_ONLY_OVERFLOW) {
		numvnodes_max = desiredvnodes * 2;
		vn_dealloc_level = bootarg_vn_dealloc_level;
	} else if (bootarg_vn_dealloc_level == DEALLOC_VNODE_ALL) {
		numvnodes_min = desiredvnodes_one_percent * 40;
		numvnodes_max = desiredvnodes * 2;
		reusablevnodes_max = (desiredvnodes_one_percent * 20) - deadvnodes_low;
		vn_dealloc_level = bootarg_vn_dealloc_level;
	}

	/* seed the CoreAnalytics telemetry snapshot with the boot-time limits */
	bzero(&freeable_vnodes_telemetry, sizeof(CA_EVENT_TYPE(freeable_vnodes)));
	freeable_vnodes_telemetry.numvnodes_min = numvnodes_min;
	freeable_vnodes_telemetry.numvnodes_max = numvnodes_max;
	freeable_vnodes_telemetry.desiredvnodes = desiredvnodes;

	if (nc_smr_enabled) {
		zone_enable_smr(vnode_zone, VFS_SMR(), &vnode_smr_free);
	}

	/*
	 * create worker threads
	 */
	kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
	thread_deallocate(thread);
	kernel_thread_start((thread_continue_t)vn_laundry_continue, NULL, &thread);
	thread_deallocate(thread);
}
462 
463 /* the timeout is in 10 msecs */
464 int
vnode_waitforwrites(vnode_t vp,int output_target,int slpflag,int slptimeout,const char * msg)465 vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg)
466 {
467 	int error = 0;
468 	struct timespec ts;
469 
470 	if (output_target < 0) {
471 		return EINVAL;
472 	}
473 
474 	KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);
475 
476 	if (vp->v_numoutput > output_target) {
477 		slpflag |= PDROP;
478 
479 		vnode_lock_spin(vp);
480 
481 		while ((vp->v_numoutput > output_target) && error == 0) {
482 			if (output_target) {
483 				vp->v_flag |= VTHROTTLED;
484 			} else {
485 				vp->v_flag |= VBWAIT;
486 			}
487 
488 			ts.tv_sec = (slptimeout / 100);
489 			ts.tv_nsec = (slptimeout % 1000)  * 10 * NSEC_PER_USEC * 1000;
490 			error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);
491 
492 			vnode_lock_spin(vp);
493 		}
494 		vnode_unlock(vp);
495 	}
496 	KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);
497 
498 	return error;
499 }
500 
501 
/*
 * Account for the start of an I/O against a vnode: atomically bump the
 * outstanding-output counter.  Paired with vnode_writedone().
 */
void
vnode_startwrite(vnode_t vp)
{
	OSAddAtomic(1, &vp->v_numoutput);
}
507 
508 
/*
 * Account for the completion of an I/O started via vnode_startwrite().
 * Atomically drops v_numoutput and wakes any thread throttled in
 * vnode_waitforwrites(): VTHROTTLED sleepers are woken on every
 * completion, VBWAIT sleepers only once the counter reaches zero.
 * A NULL vp is tolerated as a no-op.
 */
void
vnode_writedone(vnode_t vp)
{
	if (vp) {
		int need_wakeup = 0;

		OSAddAtomic(-1, &vp->v_numoutput);

		vnode_lock_spin(vp);

		/* more writedones than startwrites is an accounting bug */
		if (vp->v_numoutput < 0) {
			panic("vnode_writedone: numoutput < 0");
		}

		if ((vp->v_flag & VTHROTTLED)) {
			vp->v_flag &= ~VTHROTTLED;
			need_wakeup = 1;
		}
		if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
			vp->v_flag &= ~VBWAIT;
			need_wakeup = 1;
		}
		vnode_unlock(vp);

		/* wakeup issued outside the vnode lock; sleepers wait on &v_numoutput */
		if (need_wakeup) {
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}
538 
539 
540 
541 int
vnode_hasdirtyblks(vnode_t vp)542 vnode_hasdirtyblks(vnode_t vp)
543 {
544 	struct cl_writebehind *wbp;
545 
546 	/*
547 	 * Not taking the buf_mtx as there is little
548 	 * point doing it. Even if the lock is taken the
549 	 * state can change right after that. If their
550 	 * needs to be a synchronization, it must be driven
551 	 * by the caller
552 	 */
553 	if (vp->v_dirtyblkhd.lh_first) {
554 		return 1;
555 	}
556 
557 	if (!UBCINFOEXISTS(vp)) {
558 		return 0;
559 	}
560 
561 	wbp = vp->v_ubcinfo->cl_wbehind;
562 
563 	if (wbp && (wbp->cl_number || wbp->cl_scmap)) {
564 		return 1;
565 	}
566 
567 	return 0;
568 }
569 
570 int
vnode_hascleanblks(vnode_t vp)571 vnode_hascleanblks(vnode_t vp)
572 {
573 	/*
574 	 * Not taking the buf_mtx as there is little
575 	 * point doing it. Even if the lock is taken the
576 	 * state can change right after that. If their
577 	 * needs to be a synchronization, it must be driven
578 	 * by the caller
579 	 */
580 	if (vp->v_cleanblkhd.lh_first) {
581 		return 1;
582 	}
583 	return 0;
584 }
585 
/*
 * Mark a mount as being iterated (MNT_LITER).  Called with the mount
 * lock held (see vnode_iterate()); undone by vnode_iterate_clear().
 */
void
vnode_iterate_setup(mount_t mp)
{
	mp->mnt_lflag |= MNT_LITER;
}
591 
/*
 * Scan a mount's vnode list for vnodes that would prevent an unmount.
 * Returns 1 if a busy vnode is found, 0 otherwise.  When
 * print_busy_vnodes is set and FORCECLOSE is not requested, every busy
 * vnode is printed before returning instead of bailing at the first.
 *
 * Directories, the caller-supplied skipvp, and vnodes excluded by
 * SKIPSYSTEM/SKIPSWAP/WRITECLOSE are not considered.  A vnode is "busy"
 * when it has non-kernel use counts, or when its iocount remains
 * non-zero after a 3 second grace period.
 */
int
vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
{
	vnode_t vp;
	int ret = 0;

	TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
		if (vp->v_type == VDIR) {
			continue;
		}
		if (vp == skipvp) {
			continue;
		}
		/* SKIPSYSTEM: ignore system and no-flush vnodes */
		if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
			continue;
		}
		/* SKIPSWAP: ignore swap-backed vnodes */
		if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
			continue;
		}
		/* WRITECLOSE: only regular files open for writing matter */
		if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) {
			continue;
		}

		/* Look for busy vnode */
		if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
			ret = 1;
			if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
				vprint("vnode_umount_preflight - busy vnode", vp);
			} else {
				return ret;
			}
		} else if (vp->v_iocount > 0) {
			/* Busy if iocount is > 0 for more than 3 seconds */
			tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
			if (vp->v_iocount > 0) {
				ret = 1;
				if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
					vprint("vnode_umount_preflight - busy vnode", vp);
				} else {
					return ret;
				}
			}
			continue;
		}
	}

	return ret;
}
640 
/*
 * This routine prepares iteration by moving all the vnodes to worker queue
 * called with mount lock held
 *
 * The entire mnt_vnodelist is spliced onto mnt_workerqueue by direct
 * TAILQ pointer manipulation (no per-element walk), and mnt_newvnodes
 * is reset to collect vnodes created while the iteration runs.
 * Returns 0 when the vnode list is empty (nothing to do), 1 otherwise.
 */
int
vnode_iterate_prepare(mount_t mp)
{
	vnode_t vp;

	if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
		/* nothing to do */
		return 0;
	}

	/* splice the whole vnode list onto the (empty) worker queue */
	vp = TAILQ_FIRST(&mp->mnt_vnodelist);
	vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
	mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
	mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;

	TAILQ_INIT(&mp->mnt_vnodelist);
	/* mnt_newvnodes must be empty outside an active iteration */
	if (mp->mnt_newvnodes.tqh_first != NULL) {
		panic("vnode_iterate_prepare: newvnode when entering vnode");
	}
	TAILQ_INIT(&mp->mnt_newvnodes);

	return 1;
}
668 
669 
/* called with mount lock held */
/*
 * Undo vnode_iterate_prepare(): splice any vnodes still on the worker
 * queue back onto the tail of mnt_vnodelist, and vnodes created during
 * the iteration (mnt_newvnodes) onto its head.  Returns 1 if new
 * vnodes were moved, 0 otherwise.
 */
int
vnode_iterate_reloadq(mount_t mp)
{
	int moved = 0;

	/* add the remaining entries in workerq to the end of mount vnode list */
	if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		struct vnode * mvp;
		mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);

		/* Joining the workerque entities to mount vnode list */
		if (mvp) {
			mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
		} else {
			mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
		}
		mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
		mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
		TAILQ_INIT(&mp->mnt_workerqueue);
	}

	/* add the newvnodes to the head of mount vnode list */
	if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
		struct vnode * nlvp;
		nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);

		mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
		nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
		if (mp->mnt_vnodelist.tqh_first) {
			mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
		} else {
			/* vnodelist was empty: newvnodes' tail becomes the list tail */
			mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
		}
		mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
		TAILQ_INIT(&mp->mnt_newvnodes);
		moved = 1;
	}

	return moved;
}
711 
712 
/*
 * End a mount iteration: clear the MNT_LITER flag set by
 * vnode_iterate_setup().  Called with the mount lock held.
 */
void
vnode_iterate_clear(mount_t mp)
{
	mp->mnt_lflag &= ~MNT_LITER;
}
718 
719 #if defined(__x86_64__)
720 
721 #include <i386/panic_hooks.h>
722 
/*
 * Panic-time context for vnode_iterate() (x86_64 only): records the
 * mount and the vnode currently being visited so the panic log can
 * include them.
 */
struct vnode_iterate_panic_hook {
	panic_hook_t hook;
	mount_t mp;
	struct vnode *vp;
};

/*
 * Panic hook installed by vnode_iterate().  Appends the current
 * mount/vnode pointers and their physical-range info to the panic log,
 * then dumps the memory surrounding the mount structure.
 *
 * NOTE(review): `phys` (uint64_t) and the range fields are printed with
 * "%p"; on LP64 this consumes the expected vararg width but is formally
 * a format/type mismatch — confirm against paniclog conventions before
 * changing.
 */
static void
vnode_iterate_panic_hook(panic_hook_t *hook_)
{
	struct vnode_iterate_panic_hook *hook = (struct vnode_iterate_panic_hook *)hook_;
	panic_phys_range_t range;
	uint64_t phys;

	if (panic_phys_range_before(hook->mp, &phys, &range)) {
		paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
		    hook->mp, phys, range.type, range.phys_start,
		    range.phys_start + range.len);
	} else {
		paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
	}

	if (panic_phys_range_before(hook->vp, &phys, &range)) {
		paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
		    hook->vp, phys, range.type, range.phys_start,
		    range.phys_start + range.len);
	} else {
		paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
	}
	/* dump the page before the mount struct plus the following 12KB */
	panic_dump_mem((void *)(((vm_offset_t)hook->mp - 4096) & ~4095), 12288);
}
753 #endif /* defined(__x86_64__) */
754 
/*
 * Iterate every vnode on a mount, invoking `callout(vp, arg)` on each
 * vnode that can be referenced (vget_internal with VNODE_NODEAD |
 * VNODE_WITHID | VNODE_NOSUSPEND plus caller flags).  The callout's
 * return value steers the walk:
 *   VNODE_RETURNED        - iocount is released here, continue
 *   VNODE_RETURNED_DONE   - iocount released here, stop iterating
 *   VNODE_CLAIMED         - callout kept the iocount, continue
 *   VNODE_CLAIMED_DONE    - callout kept the iocount, stop iterating
 * With VNODE_RELOAD, inactive vnodes are cast out (vnode_reload)
 * instead of being visited.  Returns 0.
 */
int
vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
    void *arg)
{
	struct vnode *vp;
	int vid, retval;
	int ret = 0;

	/*
	 * The mount iterate mutex is held for the duration of the iteration.
	 * This can be done by a state flag on the mount structure but we can
	 * run into priority inversion issues sometimes.
	 * Using a mutex allows us to benefit from the priority donation
	 * mechanisms in the kernel for locks. This mutex should never be
	 * acquired in spin mode and it should be acquired before attempting to
	 * acquire the mount lock.
	 */
	mount_iterate_lock(mp);

	mount_lock(mp);

	vnode_iterate_setup(mp);

	/* If it returns 0 then there is nothing to do */
	retval = vnode_iterate_prepare(mp);

	if (retval == 0) {
		vnode_iterate_clear(mp);
		mount_unlock(mp);
		mount_iterate_unlock(mp);
		return ret;
	}

#if defined(__x86_64__)
	/* record mp/vp so a panic mid-iteration logs where we were */
	struct vnode_iterate_panic_hook hook;
	hook.mp = mp;
	hook.vp = NULL;
	panic_hook(&hook.hook, vnode_iterate_panic_hook);
#endif
	/* iterate over all the vnodes */
	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		vp = TAILQ_FIRST(&mp->mnt_workerqueue);
#if defined(__x86_64__)
		hook.vp = vp;
#endif
		/* move the vnode back to the vnode list before visiting it */
		TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
		vid = vp->v_id;
		if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
			continue;
		}
		/* hold keeps vp's memory valid across the unlocked vget attempt */
		vnode_hold(vp);
		mount_unlock(mp);

		if (vget_internal(vp, vid, (flags | VNODE_NODEAD | VNODE_WITHID | VNODE_NOSUSPEND))) {
			mount_lock(mp);
			vnode_drop(vp);
			continue;
		}
		vnode_drop(vp);
		if (flags & VNODE_RELOAD) {
			/*
			 * we're reloading the filesystem
			 * cast out any inactive vnodes...
			 */
			if (vnode_reload(vp)) {
				/* vnode will be recycled on the refcount drop */
				vnode_put(vp);
				mount_lock(mp);
				continue;
			}
		}

		retval = callout(vp, arg);

		switch (retval) {
		case VNODE_RETURNED:
		case VNODE_RETURNED_DONE:
			vnode_put(vp);
			if (retval == VNODE_RETURNED_DONE) {
				mount_lock(mp);
				ret = 0;
				goto out;
			}
			break;

		case VNODE_CLAIMED_DONE:
			mount_lock(mp);
			ret = 0;
			goto out;
		case VNODE_CLAIMED:
		default:
			break;
		}
		mount_lock(mp);
	}

out:
#if defined(__x86_64__)
	panic_unhook(&hook.hook);
#endif
	/* put unvisited/new vnodes back on the mount's vnode list */
	(void)vnode_iterate_reloadq(mp);
	vnode_iterate_clear(mp);
	mount_unlock(mp);
	mount_iterate_unlock(mp);
	return ret;
}
862 
/* Serialize renames on this mount (mnt_renamelock). */
void
mount_lock_renames(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_renamelock);
}
868 
/* Release the rename lock taken by mount_lock_renames(). */
void
mount_unlock_renames(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_renamelock);
}
874 
/*
 * Take the mount's iteration mutex.  Must be acquired before the mount
 * lock and never in spin mode (see the comment in vnode_iterate()).
 */
void
mount_iterate_lock(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_iter_lock);
}
880 
/* Release the iteration mutex taken by mount_iterate_lock(). */
void
mount_iterate_unlock(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_iter_lock);
}
886 
/* Take the mount's general-purpose mutex (mnt_mlock). */
void
mount_lock(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_mlock);
}
892 
/* Take mnt_mlock in spin mode (short critical sections only). */
void
mount_lock_spin(mount_t mp)
{
	lck_mtx_lock_spin(&mp->mnt_mlock);
}
898 
/* Release mnt_mlock (valid for both normal and spin acquisitions). */
void
mount_unlock(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_mlock);
}
904 
905 
/*
 * Take a reference on a mount (mnt_count).  Acquires the mount lock in
 * spin mode unless the caller already holds it (locked != 0).
 */
void
mount_ref(mount_t mp, int locked)
{
	if (!locked) {
		mount_lock_spin(mp);
	}

	mp->mnt_count++;

	if (!locked) {
		mount_unlock(mp);
	}
}
919 
920 
/*
 * Drop a reference taken by mount_ref().  When the count reaches zero
 * while a drain is in progress (MNT_LDRAIN), wake the thread sleeping
 * in mount_refdrain().  Acquires the mount lock in spin mode unless
 * the caller already holds it (locked != 0).
 */
void
mount_drop(mount_t mp, int locked)
{
	if (!locked) {
		mount_lock_spin(mp);
	}

	mp->mnt_count--;

	if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) {
		wakeup(&mp->mnt_lflag);
	}

	if (!locked) {
		mount_unlock(mp);
	}
}
938 
939 
940 int
mount_iterref(mount_t mp,int locked)941 mount_iterref(mount_t mp, int locked)
942 {
943 	int retval = 0;
944 
945 	if (!locked) {
946 		mount_list_lock();
947 	}
948 	if (mp->mnt_iterref < 0) {
949 		retval = 1;
950 	} else {
951 		mp->mnt_iterref++;
952 	}
953 	if (!locked) {
954 		mount_list_unlock();
955 	}
956 	return retval;
957 }
958 
959 int
mount_isdrained(mount_t mp,int locked)960 mount_isdrained(mount_t mp, int locked)
961 {
962 	int retval;
963 
964 	if (!locked) {
965 		mount_list_lock();
966 	}
967 	if (mp->mnt_iterref < 0) {
968 		retval = 1;
969 	} else {
970 		retval = 0;
971 	}
972 	if (!locked) {
973 		mount_list_unlock();
974 	}
975 	return retval;
976 }
977 
/*
 * Release an iteration reference taken by mount_iterref() and wake any
 * thread waiting in mount_iterdrain().
 */
void
mount_iterdrop(mount_t mp)
{
	mount_list_lock();
	mp->mnt_iterref--;
	wakeup(&mp->mnt_iterref);
	mount_list_unlock();
}
986 
/*
 * Wait for all iteration references on a mount to drain, then park
 * mnt_iterref at -1 so mount_iterref() fails until mount_iterreset()
 * re-enables it.
 */
void
mount_iterdrain(mount_t mp)
{
	mount_list_lock();
	while (mp->mnt_iterref) {
		msleep((caddr_t)&mp->mnt_iterref, &mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
	}
	/* mount iterations drained */
	mp->mnt_iterref = -1;
	mount_list_unlock();
}
997 }
/* Re-enable mount iteration after a mount_iterdrain(). */
void
mount_iterreset(mount_t mp)
{
	mount_list_lock();
	if (mp->mnt_iterref == -1) {
		mp->mnt_iterref = 0;
	}
	mount_list_unlock();
}
1007 
/* always called with mount lock held */
/*
 * Wait for all mount references (mnt_count) to drain.  Sets MNT_LDRAIN
 * while waiting (mount_drop() issues the wakeup when the count hits
 * zero); panics on re-entry or if vnodes remain on the mount once
 * drained.  Returns 0.
 */
int
mount_refdrain(mount_t mp)
{
	if (mp->mnt_lflag & MNT_LDRAIN) {
		panic("already in drain");
	}
	mp->mnt_lflag |= MNT_LDRAIN;

	while (mp->mnt_count) {
		msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
	}

	if (mp->mnt_vnodelist.tqh_first != NULL) {
		panic("mount_refdrain: dangling vnode");
	}

	mp->mnt_lflag &= ~MNT_LDRAIN;

	return 0;
}
1029 
/* Tags the mount point as not supporting extended readdir for NFS exports */
void
mount_set_noreaddirext(mount_t mp)
{
	/* serialize with other mnt_kern_flag updates via the mount lock */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
	mount_unlock(mp);
}
1038 
1039 /*
1040  * Mark a mount point as busy. Used to synchronize access and to delay
1041  * unmounting.
1042  */
int
vfs_busy(mount_t mp, int flags)
{
restart:
	/* unlocked fast-path check: a dead mount can never become busy again */
	if (mp->mnt_lflag & MNT_LDEAD) {
		return ENOENT;
	}

	mount_lock(mp);

	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (flags & LK_NOWAIT || mp->mnt_lflag & MNT_LDEAD) {
			mount_unlock(mp);
			return ENOENT;
		}

		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		mp->mnt_lflag |= MNT_LWAIT;
		/* PDROP: msleep releases mnt_mlock before returning */
		msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
		return ENOENT;
	}

	mount_unlock(mp);

	/* shared busy reference; released by vfs_unbusy() */
	lck_rw_lock_shared(&mp->mnt_rwlock);

	/*
	 * Until we are granted the rwlock, it's possible for the mount point to
	 * change state, so re-evaluate before granting the vfs_busy.
	 */
	if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
		lck_rw_done(&mp->mnt_rwlock);
		goto restart;
	}
	return 0;
}
1084 
1085 /*
1086  * Free a busy filesystem.
1087  */
void
vfs_unbusy(mount_t mp)
{
	/* drop the shared busy reference taken in vfs_busy() */
	lck_rw_done(&mp->mnt_rwlock);
}
1093 
1094 
1095 
/*
 * Tear down a mount allocated by vfs_rootmountalloc_internal() after a
 * failed root-mount attempt: drop the vfstable reference, release the
 * busy state taken at allocation, and free the mount structure.
 */
static void
vfs_rootmountfailed(mount_t mp)
{
	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	vfs_unbusy(mp);

	/* when name-cache SMR is enabled, wait out readers before destroying */
	if (nc_smr_enabled) {
		vfs_smr_synchronize();
	}

	mount_lock_destroy(mp);

#if CONFIG_MACF
	mac_mount_label_destroy(mp);
#endif

	zfree(mount_zone, mp);
}
1117 
1118 /*
1119  * Lookup a filesystem type, and if found allocate and initialize
1120  * a mount structure for it.
1121  *
1122  * Devname is usually updated by mount(8) after booting.
1123  */
static mount_t
vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
{
	mount_t mp;

	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;

	mount_lock_init(mp);
	/* returned busy; released via vfs_unbusy() on success or failure */
	(void)vfs_busy(mp, LK_NOWAIT);

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);

	mp->mnt_vtable = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	/* root starts read-only; remounted read-write later (see header comment) */
	mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
	mp->mnt_vnodecovered = NULLVP;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;

	mount_list_lock();
	vfsp->vfc_refcount++;
	mount_list_unlock();

	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	mp->mnt_vfsstat.f_mntonname[0] = '/';
	/* XXX const poisoning layering violation */
	(void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);

#if CONFIG_MACF
	mac_mount_label_init(mp);
	mac_mount_label_associate(vfs_context_kernel(), mp);
#endif
	return mp;
}
1174 
1175 errno_t
vfs_rootmountalloc(const char * fstypename,const char * devname,mount_t * mpp)1176 vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
1177 {
1178 	struct vfstable *vfsp;
1179 
1180 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1181 		if (!strncmp(vfsp->vfc_name, fstypename,
1182 		    sizeof(vfsp->vfc_name))) {
1183 			break;
1184 		}
1185 	}
1186 	if (vfsp == NULL) {
1187 		return ENODEV;
1188 	}
1189 
1190 	*mpp = vfs_rootmountalloc_internal(vfsp, devname);
1191 
1192 	if (*mpp) {
1193 		return 0;
1194 	}
1195 
1196 	return ENOMEM;
1197 }
1198 
1199 #define DBG_MOUNTROOT (FSDBG_CODE(DBG_MOUNT, 0))
1200 
1201 /*
1202  * Find an appropriate filesystem to use for the root. If a filesystem
1203  * has not been preselected, walk through the list of known filesystems
1204  * trying those that have mountroot routines, and try them until one
1205  * works or we have tried them all.
1206  */
1207 extern int (*mountroot)(void);
1208 
int
vfs_mountroot(void)
{
#if CONFIG_MACF
	struct vnode *vp;
#endif
	struct vfstable *vfsp;
	vfs_context_t ctx = vfs_context_kernel();
	struct vfs_attr vfsattr;
	int     error;
	mount_t mp;
	vnode_t bdevvp_rootvp;

	/*
	 * Reset any prior "unmounting everything" state.  This handles the
	 * situation where mount root and then unmountall and re-mountroot
	 * a new image (see bsd/kern/imageboot.c).
	 */
	vfs_unmountall_started = vfs_unmountall_finished = 0;
	OSMemoryBarrier();

	KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_START);
	if (mountroot != NULL) {
		/*
		 * used for netboot which follows a different set of rules
		 */
		error = (*mountroot)();

		KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 0);
		return error;
	}
	/* create the block-device vnode for the root device */
	if ((error = bdevvp(rootdev, &rootvp))) {
		printf("vfs_mountroot: can't setup bdevvp\n");

		KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 1);
		return error;
	}
	/*
	 * 4951998 - code we call in vfc_mountroot may replace rootvp
	 * so keep a local copy for some house keeping.
	 */
	bdevvp_rootvp = rootvp;

	/* try each filesystem type capable of mounting root until one succeeds */
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL
		    && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
			continue;
		}

		mp = vfs_rootmountalloc_internal(vfsp, "root_device");
		mp->mnt_devvp = rootvp;

		if (vfsp->vfc_mountroot) {
			error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
		} else {
			error = VFS_MOUNT(mp, rootvp, 0, ctx);
		}

		if (!error) {
			if (bdevvp_rootvp != rootvp) {
				/*
				 * rootvp changed...
				 *   bump the iocount and fix up mnt_devvp for the
				 *   new rootvp (it will already have a usecount taken)...
				 *   drop the iocount and the usecount on the original
				 *   since we are no longer going to use it...
				 */
				vnode_getwithref(rootvp);
				mp->mnt_devvp = rootvp;

				vnode_rele(bdevvp_rootvp);
				vnode_put(bdevvp_rootvp);
			}
			mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;

			/* drop the busy reference taken by vfs_rootmountalloc_internal */
			vfs_unbusy(mp);

			mount_list_add(mp);

			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 */
			vfs_init_io_attributes(rootvp, mp);

			if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
				root_is_CF_drive = TRUE;
			}

			/*
			 * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
			 */
			if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
			if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
				mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
			}

#if defined(XNU_TARGET_OS_OSX)
			/* tune the boot progress bar to the media speed */
			uint32_t speed;

			if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) {
				speed = 128;
			} else if (disk_conditioner_mount_is_ssd(mp)) {
				speed = 7 * 256;
			} else {
				speed = 256;
			}
			vc_progress_setdiskspeed(speed);
#endif /* XNU_TARGET_OS_OSX */
			/*
			 * Probe root file system for additional features.
			 */
			(void)VFS_START(mp, 0, ctx);

			VFSATTR_INIT(&vfsattr);
			VFSATTR_WANTED(&vfsattr, f_capabilities);
			if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
			    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
					mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
				}
#if NAMEDSTREAMS
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
					mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
				}
#endif
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
					mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
				}

				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
					mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
				}
			}

			/*
			 * get rid of iocount reference returned
			 * by bdevvp (or picked up by us on the substituted
			 * rootvp)... it (or we) will have also taken
			 * a usecount reference which we want to keep
			 */
			vnode_put(rootvp);

#if CONFIG_MACF
			if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
				KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 2);
				return 0;
			}

			error = VFS_ROOT(mp, &vp, ctx);
			if (error) {
				printf("%s() VFS_ROOT() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
			error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
			/*
			 * get rid of reference provided by VFS_ROOT
			 */
			vnode_put(vp);

			if (error) {
				printf("%s() vnode_label() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
#endif
			KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 3);
			return 0;
		}
		/* this filesystem type failed; tear down and try the next one */
		vfs_rootmountfailed(mp);
#if CONFIG_MACF
fail:
#endif
		if (error != EINVAL) {
			printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
		}
	}
	KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error ? error : ENODEV, 4);
	return ENODEV;
}
1400 
/*
 * vfs_iterate() callback: purge all name-cache entries for this mount.
 */
static int
cache_purge_callback(mount_t mp, __unused void * arg)
{
	cache_purgevfs(mp);
	return VFS_RETURNED;
}
1407 
1408 extern lck_rw_t rootvnode_rw_lock;
1409 extern void set_rootvnode(vnode_t);
1410 
1411 
/*
 * vfs_iterate() callback used after a root switch: recompute each
 * mount's f_mntonname from its covered vnode's current path.
 * Mounts at "/" and "/dev" keep their absolute paths across the pivot
 * and are skipped.
 */
static int
mntonname_fixup_callback(mount_t mp, __unused void *arg)
{
	int error = 0;

	/* sizeof("...") includes the NUL, so these are exact-match compares */
	if ((strncmp(&mp->mnt_vfsstat.f_mntonname[0], "/", sizeof("/")) == 0) ||
	    (strncmp(&mp->mnt_vfsstat.f_mntonname[0], "/dev", sizeof("/dev")) == 0)) {
		return 0;
	}

	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
1436 
/*
 * vfs_iterate() callback: clear MNTK_BACKS_ROOT on every mount.  After
 * a root switch the caller is responsible for re-setting it where
 * appropriate (see vfs_switch_root() header comment).
 */
static int
clear_mntk_backs_root_callback(mount_t mp, __unused void *arg)
{
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	mp->mnt_kern_flag &= ~MNTK_BACKS_ROOT;
	lck_rw_done(&mp->mnt_rwlock);
	return VFS_RETURNED;
}
1445 
1446 static int
verify_incoming_rootfs(vnode_t * incoming_rootvnodep,vfs_context_t ctx,vfs_switch_root_flags_t flags)1447 verify_incoming_rootfs(vnode_t *incoming_rootvnodep, vfs_context_t ctx,
1448     vfs_switch_root_flags_t flags)
1449 {
1450 	mount_t mp;
1451 	vnode_t tdp;
1452 	vnode_t incoming_rootvnode_with_iocount = *incoming_rootvnodep;
1453 	vnode_t incoming_rootvnode_with_usecount = NULLVP;
1454 	int error = 0;
1455 
1456 	if (vnode_vtype(incoming_rootvnode_with_iocount) != VDIR) {
1457 		printf("Incoming rootfs path not a directory\n");
1458 		error = ENOTDIR;
1459 		goto done;
1460 	}
1461 
1462 	/*
1463 	 * Before we call VFS_ROOT, we have to let go of the iocount already
1464 	 * acquired, but before doing that get a usecount.
1465 	 */
1466 	vnode_ref_ext(incoming_rootvnode_with_iocount, 0, VNODE_REF_FORCE);
1467 	incoming_rootvnode_with_usecount = incoming_rootvnode_with_iocount;
1468 	vnode_lock_spin(incoming_rootvnode_with_usecount);
1469 	if ((mp = incoming_rootvnode_with_usecount->v_mount)) {
1470 		mp->mnt_crossref++;
1471 		vnode_unlock(incoming_rootvnode_with_usecount);
1472 	} else {
1473 		vnode_unlock(incoming_rootvnode_with_usecount);
1474 		printf("Incoming rootfs root vnode does not have associated mount\n");
1475 		error = ENOTDIR;
1476 		goto done;
1477 	}
1478 
1479 	if (vfs_busy(mp, LK_NOWAIT)) {
1480 		printf("Incoming rootfs root vnode mount is busy\n");
1481 		error = ENOENT;
1482 		goto out;
1483 	}
1484 
1485 	vnode_put(incoming_rootvnode_with_iocount);
1486 	incoming_rootvnode_with_iocount = NULLVP;
1487 
1488 	error = VFS_ROOT(mp, &tdp, ctx);
1489 
1490 	if (error) {
1491 		printf("Could not get rootvnode of incoming rootfs\n");
1492 	} else if (tdp != incoming_rootvnode_with_usecount) {
1493 		vnode_put(tdp);
1494 		tdp = NULLVP;
1495 		printf("Incoming rootfs root vnode mount is is not a mountpoint\n");
1496 		error = EINVAL;
1497 		goto out_busy;
1498 	} else {
1499 		incoming_rootvnode_with_iocount = tdp;
1500 		tdp = NULLVP;
1501 	}
1502 
1503 	if ((flags & VFSSR_VIRTUALDEV_PROHIBITED) != 0) {
1504 		if (mp->mnt_flag & MNTK_VIRTUALDEV) {
1505 			error = ENODEV;
1506 		}
1507 		if (error) {
1508 			printf("Incoming rootfs is backed by a virtual device; cannot switch to it");
1509 			goto out_busy;
1510 		}
1511 	}
1512 
1513 out_busy:
1514 	vfs_unbusy(mp);
1515 
1516 out:
1517 	vnode_lock(incoming_rootvnode_with_usecount);
1518 	mp->mnt_crossref--;
1519 	if (mp->mnt_crossref < 0) {
1520 		panic("mount cross refs -ve");
1521 	}
1522 	vnode_unlock(incoming_rootvnode_with_usecount);
1523 
1524 done:
1525 	if (incoming_rootvnode_with_usecount) {
1526 		vnode_rele(incoming_rootvnode_with_usecount);
1527 		incoming_rootvnode_with_usecount = NULLVP;
1528 	}
1529 
1530 	if (error && incoming_rootvnode_with_iocount) {
1531 		vnode_put(incoming_rootvnode_with_iocount);
1532 		incoming_rootvnode_with_iocount = NULLVP;
1533 	}
1534 
1535 	*incoming_rootvnodep = incoming_rootvnode_with_iocount;
1536 	return error;
1537 }
1538 
1539 /*
1540  * vfs_switch_root()
1541  *
1542  * Move the current root volume, and put a different volume at the root.
1543  *
1544  * incoming_vol_old_path: This is the path where the incoming root volume
1545  *	is mounted when this function begins.
1546  * outgoing_vol_new_path: This is the path where the outgoing root volume
1547  *	will be mounted when this function (successfully) ends.
1548  *	Note: Do not use a leading slash.
1549  *
1550  * Volumes mounted at several fixed points (including /dev) will be preserved
1551  * at the same absolute path. That means they will move within the folder
1552  * hierarchy during the pivot operation. For example, /dev before the pivot
1553  * will be at /dev after the pivot.
1554  *
1555  * If any filesystem has MNTK_BACKS_ROOT set, it will be cleared. If the
1556  * incoming root volume is actually a disk image backed by some other
1557  * filesystem, it is the caller's responsibility to re-set MNTK_BACKS_ROOT
1558  * as appropriate.
1559  */
int
vfs_switch_root(const char *incoming_vol_old_path,
    const char *outgoing_vol_new_path,
    vfs_switch_root_flags_t flags)
{
	// grumble grumble
#define countof(x) (sizeof(x) / sizeof(x[0]))

	/* bookkeeping for one mount that survives the pivot at a fixed path */
	struct preserved_mount {
		vnode_t pm_rootvnode;
		mount_t pm_mount;
		vnode_t pm_new_covered_vp;
		vnode_t pm_old_covered_vp;
		const char *pm_path;
	};

	vfs_context_t ctx = vfs_context_kernel();
	vnode_t incoming_rootvnode = NULLVP;
	vnode_t outgoing_vol_new_covered_vp = NULLVP;
	vnode_t incoming_vol_old_covered_vp = NULLVP;
	mount_t outgoing = NULL;
	mount_t incoming = NULL;

	struct preserved_mount devfs = { NULLVP, NULL, NULLVP, NULLVP, "dev" };
	struct preserved_mount preboot = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Preboot" };
	struct preserved_mount recovery = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Recovery" };
	struct preserved_mount vm = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/VM" };
	struct preserved_mount update = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Update" };
	struct preserved_mount iscPreboot = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/iSCPreboot" };
	struct preserved_mount hardware = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Hardware" };
	struct preserved_mount xarts = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/xarts" };
	struct preserved_mount factorylogs = { NULLVP, NULL, NULLVP, NULLVP, "FactoryLogs" };
	struct preserved_mount idiags = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Diags" };

	struct preserved_mount *preserved[10];
	preserved[0] = &devfs;
	preserved[1] = &preboot;
	preserved[2] = &recovery;
	preserved[3] = &vm;
	preserved[4] = &update;
	preserved[5] = &iscPreboot;
	preserved[6] = &hardware;
	preserved[7] = &xarts;
	preserved[8] = &factorylogs;
	preserved[9] = &idiags;

	int error;

	printf("%s : shuffling mount points : %s <-> / <-> %s\n", __FUNCTION__, incoming_vol_old_path, outgoing_vol_new_path);

	if (outgoing_vol_new_path[0] == '/') {
		// I should have written this to be more helpful and just advance the pointer forward past the slash
		printf("Do not use a leading slash in outgoing_vol_new_path\n");
		return EINVAL;
	}

	// Set incoming_rootvnode.
	// Find the vnode representing the mountpoint of the new root
	// filesystem. That will be the new root directory.
	error = vnode_lookup(incoming_vol_old_path, 0, &incoming_rootvnode, ctx);
	if (error) {
		printf("Incoming rootfs root vnode not found\n");
		error = ENOENT;
		goto done;
	}

	/*
	 * This function drops the iocount and sets the vnode to NULL on error.
	 */
	error = verify_incoming_rootfs(&incoming_rootvnode, ctx, flags);
	if (error) {
		goto done;
	}

	/*
	 * Set outgoing_vol_new_covered_vp.
	 * Find the vnode representing the future mountpoint of the old
	 * root filesystem, inside the directory incoming_rootvnode.
	 * Right now it's at "/incoming_vol_old_path/outgoing_vol_new_path".
	 * soon it will become "/oldrootfs_path_after", which will be covered.
	 */
	error = vnode_lookupat(outgoing_vol_new_path, 0, &outgoing_vol_new_covered_vp, ctx, incoming_rootvnode);
	if (error) {
		printf("Outgoing rootfs path not found, abandoning / switch, error = %d\n", error);
		error = ENOENT;
		goto done;
	}
	if (vnode_vtype(outgoing_vol_new_covered_vp) != VDIR) {
		printf("Outgoing rootfs path is not a directory, abandoning / switch\n");
		error = ENOTDIR;
		goto done;
	}

	/*
	 * Find the preserved mounts - see if they are mounted. Get their root
	 * vnode if they are. If they aren't, leave rootvnode NULL which will
	 * be the signal to ignore this mount later on.
	 *
	 * Also get preserved mounts' new_covered_vp.
	 * Find the node representing the folder "dev" inside the directory newrootvnode.
	 * Right now it's at "/incoming_vol_old_path/dev".
	 * Soon it will become /dev, which will be covered by the devfs mountpoint.
	 */
	for (size_t i = 0; i < countof(preserved); i++) {
		struct preserved_mount *pmi = preserved[i];

		error = vnode_lookupat(pmi->pm_path, 0, &pmi->pm_rootvnode, ctx, rootvnode);
		if (error) {
			printf("skipping preserved mountpoint because not found or error: %d: %s\n", error, pmi->pm_path);
			// not fatal. try the next one in the list.
			continue;
		}
		bool is_mountpoint = false;
		vnode_lock_spin(pmi->pm_rootvnode);
		if ((pmi->pm_rootvnode->v_flag & VROOT) != 0) {
			is_mountpoint = true;
		}
		vnode_unlock(pmi->pm_rootvnode);
		if (!is_mountpoint) {
			printf("skipping preserved mountpoint because not a mountpoint: %s\n", pmi->pm_path);
			vnode_put(pmi->pm_rootvnode);
			pmi->pm_rootvnode = NULLVP;
			// not fatal. try the next one in the list.
			continue;
		}

		error = vnode_lookupat(pmi->pm_path, 0, &pmi->pm_new_covered_vp, ctx, incoming_rootvnode);
		if (error) {
			printf("preserved new mount directory not found or error: %d: %s\n", error, pmi->pm_path);
			error = ENOENT;
			goto done;
		}
		if (vnode_vtype(pmi->pm_new_covered_vp) != VDIR) {
			printf("preserved new mount directory not directory: %s\n", pmi->pm_path);
			error = ENOTDIR;
			goto done;
		}

		printf("will preserve mountpoint across pivot: /%s\n", pmi->pm_path);
	}

	/*
	 * --
	 * At this point, everything has been prepared and all error conditions
	 * have been checked. We check everything we can before this point;
	 * from now on we start making destructive changes, and we can't stop
	 * until we reach the end.
	 * ----
	 */

	/* this usecount is transferred to the mnt_vnodecovered */
	vnode_ref_ext(outgoing_vol_new_covered_vp, 0, VNODE_REF_FORCE);
	/* this usecount is transferred to set_rootvnode */
	vnode_ref_ext(incoming_rootvnode, 0, VNODE_REF_FORCE);


	for (size_t i = 0; i < countof(preserved); i++) {
		struct preserved_mount *pmi = preserved[i];
		if (pmi->pm_rootvnode == NULLVP) {
			continue;
		}

		/* this usecount is transferred to the mnt_vnodecovered */
		vnode_ref_ext(pmi->pm_new_covered_vp, 0, VNODE_REF_FORCE);

		/* The new_covered_vp is a mountpoint from now on. */
		vnode_lock_spin(pmi->pm_new_covered_vp);
		pmi->pm_new_covered_vp->v_flag |= VMOUNTEDHERE;
		vnode_unlock(pmi->pm_new_covered_vp);
	}

	/* The outgoing_vol_new_covered_vp is a mountpoint from now on. */
	vnode_lock_spin(outgoing_vol_new_covered_vp);
	outgoing_vol_new_covered_vp->v_flag |= VMOUNTEDHERE;
	vnode_unlock(outgoing_vol_new_covered_vp);


	/*
	 * Identify the mount_ts of the mounted filesystems that are being
	 * manipulated: outgoing rootfs, incoming rootfs, and the preserved
	 * mounts.
	 */
	outgoing = rootvnode->v_mount;
	incoming = incoming_rootvnode->v_mount;
	for (size_t i = 0; i < countof(preserved); i++) {
		struct preserved_mount *pmi = preserved[i];
		if (pmi->pm_rootvnode == NULLVP) {
			continue;
		}

		pmi->pm_mount = pmi->pm_rootvnode->v_mount;
	}

	/* blocks concurrent vnode_lookups until the new root is installed */
	lck_rw_lock_exclusive(&rootvnode_rw_lock);

	/* Setup incoming as the new rootfs */
	lck_rw_lock_exclusive(&incoming->mnt_rwlock);
	incoming_vol_old_covered_vp = incoming->mnt_vnodecovered;
	incoming->mnt_vnodecovered = NULLVP;
	strlcpy(incoming->mnt_vfsstat.f_mntonname, "/", MAXPATHLEN);
	incoming->mnt_flag |= MNT_ROOTFS;
	lck_rw_done(&incoming->mnt_rwlock);

	/*
	 * The preserved mountpoints will now be moved to
	 * incoming_rootnode/pm_path, and then by the end of the function,
	 * since incoming_rootnode is going to /, the preserved mounts
	 * will be end up back at /pm_path
	 */
	for (size_t i = 0; i < countof(preserved); i++) {
		struct preserved_mount *pmi = preserved[i];
		if (pmi->pm_rootvnode == NULLVP) {
			continue;
		}

		lck_rw_lock_exclusive(&pmi->pm_mount->mnt_rwlock);
		pmi->pm_old_covered_vp = pmi->pm_mount->mnt_vnodecovered;
		pmi->pm_mount->mnt_vnodecovered = pmi->pm_new_covered_vp;
		vnode_lock_spin(pmi->pm_new_covered_vp);
		pmi->pm_new_covered_vp->v_mountedhere = pmi->pm_mount;
		SET(pmi->pm_new_covered_vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(pmi->pm_new_covered_vp);
		lck_rw_done(&pmi->pm_mount->mnt_rwlock);
	}

	/*
	 * The old root volume now covers outgoing_vol_new_covered_vp
	 * on the new root volume. Remove the ROOTFS marker.
	 * Now it is to be found at outgoing_vol_new_path
	 */
	lck_rw_lock_exclusive(&outgoing->mnt_rwlock);
	outgoing->mnt_vnodecovered = outgoing_vol_new_covered_vp;
	strlcpy(outgoing->mnt_vfsstat.f_mntonname, "/", MAXPATHLEN);
	strlcat(outgoing->mnt_vfsstat.f_mntonname, outgoing_vol_new_path, MAXPATHLEN);
	outgoing->mnt_flag &= ~MNT_ROOTFS;
	vnode_lock_spin(outgoing_vol_new_covered_vp);
	outgoing_vol_new_covered_vp->v_mountedhere = outgoing;
	vnode_unlock(outgoing_vol_new_covered_vp);
	lck_rw_done(&outgoing->mnt_rwlock);

	if (!(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV) &&
	    (TAILQ_FIRST(&mountlist) == outgoing)) {
		vfs_setmntsystem(outgoing);
	}

	/*
	 * Finally, remove the mount_t linkage from the previously covered
	 * vnodes on the old root volume. These were incoming_vol_old_path,
	 * and each preserved mounts's "/pm_path". The filesystems previously
	 * mounted there have already been moved away.
	 */
	vnode_lock_spin(incoming_vol_old_covered_vp);
	incoming_vol_old_covered_vp->v_flag &= ~VMOUNT;
	incoming_vol_old_covered_vp->v_mountedhere = NULL;
	vnode_unlock(incoming_vol_old_covered_vp);

	for (size_t i = 0; i < countof(preserved); i++) {
		struct preserved_mount *pmi = preserved[i];
		if (pmi->pm_rootvnode == NULLVP) {
			continue;
		}

		vnode_lock_spin(pmi->pm_old_covered_vp);
		CLR(pmi->pm_old_covered_vp->v_flag, VMOUNTEDHERE);
		pmi->pm_old_covered_vp->v_mountedhere = NULL;
		vnode_unlock(pmi->pm_old_covered_vp);
	}

	/*
	 * Clear the name cache since many cached names are now invalid.
	 */
	vfs_iterate(0 /* flags */, cache_purge_callback, NULL);

	/*
	 * Actually change the rootvnode! And finally drop the lock that
	 * prevents concurrent vnode_lookups.
	 */
	set_rootvnode(incoming_rootvnode);
	lck_rw_unlock_exclusive(&rootvnode_rw_lock);

	if (!(incoming->mnt_kern_flag & MNTK_VIRTUALDEV) &&
	    !(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV)) {
		/*
		 * Switch the order of mount structures in the mountlist, new root
		 * mount moves to the head of the list followed by /dev and the other
		 * preserved mounts then all the preexisting mounts (old rootfs + any
		 * others)
		 */
		mount_list_lock();
		for (size_t i = 0; i < countof(preserved); i++) {
			struct preserved_mount *pmi = preserved[i];
			if (pmi->pm_rootvnode == NULLVP) {
				continue;
			}

			TAILQ_REMOVE(&mountlist, pmi->pm_mount, mnt_list);
			TAILQ_INSERT_HEAD(&mountlist, pmi->pm_mount, mnt_list);
		}
		TAILQ_REMOVE(&mountlist, incoming, mnt_list);
		TAILQ_INSERT_HEAD(&mountlist, incoming, mnt_list);
		mount_list_unlock();
	}

	/*
	 * Fixups across all volumes
	 */
	vfs_iterate(0 /* flags */, mntonname_fixup_callback, NULL);
	vfs_iterate(0 /* flags */, clear_mntk_backs_root_callback, NULL);

	error = 0;

done:
	/* common exit: drop all lookup references taken above */
	for (size_t i = 0; i < countof(preserved); i++) {
		struct preserved_mount *pmi = preserved[i];

		if (pmi->pm_rootvnode) {
			vnode_put(pmi->pm_rootvnode);
		}
		if (pmi->pm_new_covered_vp) {
			vnode_put(pmi->pm_new_covered_vp);
		}
		if (pmi->pm_old_covered_vp) {
			vnode_rele(pmi->pm_old_covered_vp);
		}
	}

	if (outgoing_vol_new_covered_vp) {
		vnode_put(outgoing_vol_new_covered_vp);
	}

	if (incoming_vol_old_covered_vp) {
		vnode_rele(incoming_vol_old_covered_vp);
	}

	if (incoming_rootvnode) {
		vnode_put(incoming_rootvnode);
	}

	printf("%s : done shuffling mount points with error: %d\n", __FUNCTION__, error);
	return error;
}
1901 
1902 /*
1903  * Mount the Recovery volume of a container
1904  */
int
vfs_mount_recovery(void)
{
#if CONFIG_MOUNT_PREBOOTRECOVERY
	int error = 0;

	/* hold an iocount on the root vnode; root must already be mounted */
	error = vnode_get(rootvnode);
	if (error) {
		/* root must be mounted first */
		printf("vnode_get(rootvnode) failed with error %d\n", error);
		return error;
	}

	char recoverypath[] = PLATFORM_RECOVERY_VOLUME_MOUNT_POINT; /* !const because of internal casting */

	/* Mount the recovery volume */
	printf("attempting kernel mount for recovery volume... \n");
	error = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP,
	    recoverypath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_RECOVERYVOL), vfs_context_kernel());

	if (error) {
		printf("Failed to mount recovery volume (%d)\n", error);
	} else {
		printf("mounted recovery volume\n");
	}

	vnode_put(rootvnode);
	return error;
#else
	/* recovery-volume mount support not compiled in; nothing to do */
	return 0;
#endif
}
1937 
1938 /*
1939  * Lookup a mount point by filesystem identifier.
1940  */
1941 
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	/* (fsid, locked=0, withref=0): takes the mount list lock internally */
	return mount_list_lookupby_fsid(fsid, 0, 0);
}
1947 
/*
 * Same as vfs_getvfs(), for callers that already hold the mount list
 * lock (locked=1).
 */
static struct mount *
vfs_getvfs_locked(fsid_t *fsid)
{
	return mount_list_lookupby_fsid(fsid, 1, 0);
}
1953 
1954 struct mount *
vfs_getvfs_with_vfsops(fsid_t * fsid,const struct vfsops * const ops)1955 vfs_getvfs_with_vfsops(fsid_t *fsid, const struct vfsops * const ops)
1956 {
1957 	mount_t mp = mount_list_lookupby_fsid(fsid, 0, 0);
1958 
1959 	if (mp != NULL && mp->mnt_op != ops) {
1960 		mp = NULL;
1961 	}
1962 	return mp;
1963 }
1964 
1965 struct mount *
vfs_getvfs_by_mntonname(char * path)1966 vfs_getvfs_by_mntonname(char *path)
1967 {
1968 	mount_t retmp = (mount_t)0;
1969 	mount_t mp;
1970 
1971 	mount_list_lock();
1972 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1973 		if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
1974 		    sizeof(mp->mnt_vfsstat.f_mntonname))) {
1975 			retmp = mp;
1976 			if (mount_iterref(retmp, 1)) {
1977 				retmp = NULL;
1978 			}
1979 			goto out;
1980 		}
1981 	}
1982 out:
1983 	mount_list_unlock();
1984 	return retmp;
1985 }
1986 
/*
 * generation number for creation of new fsids; modified only under the
 * mount_list_lock (see vfs_getnewfsid) and allowed to wrap, skipping 0
 */
u_short mntid_gen = 0;
1989 /*
1990  * Get a new unique fsid
1991  */
void
vfs_getnewfsid(struct mount *mp)
{
	fsid_t tfsid;
	int mtype;

	mount_list_lock();

	/* generate a new fsid */
	mtype = mp->mnt_vtable->vfc_typenum;
	/* bump the generation counter, skipping 0 on wraparound */
	if (++mntid_gen == 0) {
		mntid_gen++;
	}
	tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
	tfsid.val[1] = mtype;

	/* keep regenerating until the candidate fsid is not already in use */
	while (vfs_getvfs_locked(&tfsid)) {
		if (++mntid_gen == 0) {
			mntid_gen++;
		}
		tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
	}

	mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
	mount_list_unlock();
}
2019 
2020 /*
2021  * Routines having to do with the management of the vnode table.
2022  */
extern int(**dead_vnodeop_p)(void *);   /* vnode ops vector used for dead (reclaimed) vnodes */
/* occupancy counters for the vnode free lists (see vnode_list_add/remove) */
long numvnodes, freevnodes, deadvnodes, async_work_vnodes;
long busyvnodes = 0;            /* updated with os_atomic ops; vnodes currently off all free lists */
long deadvnodes_noreuse = 0;    /* dead vnodes also marked VLIST_NO_REUSE */
int32_t freeablevnodes = 0;
uint64_t allocedvnodes = 0;
uint64_t deallocedvnodes = 0;


/* bookkeeping for the async work queue and for dead-vnode waiters */
int async_work_timed_out = 0;
int async_work_handled = 0;
int dead_vnode_wanted = 0;      /* count of threads waiting for a vnode on the dead list */
int dead_vnode_waited = 0;
2036 
2037 /*
2038  * Move a vnode from one mount queue to another.
2039  */
static void
insmntque(vnode_t vp, mount_t mp)
{
	mount_t lmp;
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
		if ((vp->v_lflag & VNAMED_MOUNT) == 0) {
			panic("insmntque: vp not in mount vnode list");
		}
		vp->v_lflag &= ~VNAMED_MOUNT;

		mount_lock_spin(lmp);

		mount_drop(lmp, 1);

		/*
		 * tqe_next == NULL means vp is the tail of whichever of the
		 * three mount queues it lives on; use TAILQ_REMOVE so the
		 * queue's tail pointer gets fixed up.  Otherwise the entry
		 * can be unlinked by hand without knowing its queue.
		 */
		if (vp->v_mntvnodes.tqe_next == NULL) {
			if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) {
				TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
			} else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) {
				TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
			} else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) {
				TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
			}
		} else {
			vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
			*vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
		}
		vp->v_mntvnodes.tqe_next = NULL;
		vp->v_mntvnodes.tqe_prev = NULL;
		mount_unlock(lmp);
		vnode_drop(vp);
		return;
	}

	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL) {
		mount_lock_spin(mp);
		if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) {
			panic("vp already in mount list");
		}
		/* MNT_LITER: a list iteration is in progress; stage on mnt_newvnodes */
		if (mp->mnt_lflag & MNT_LITER) {
			TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
		} else {
			TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
		}
		if (vp->v_lflag & VNAMED_MOUNT) {
			panic("insmntque: vp already in mount vnode list");
		}
		vnode_hold(vp);
		vp->v_lflag |= VNAMED_MOUNT;
		mount_ref(mp, 1);
		mount_unlock(mp);
	}
}
2098 
2099 
2100 /*
2101  * Create a vnode for a block device.
2102  * Used for root filesystem, argdev, and swap areas.
2103  * Also used for memory file system special devices.
2104  */
int
bdevvp(dev_t dev, vnode_t *vpp)
{
	vnode_t nvp;
	int     error;
	struct vnode_fsparam vfsp;
	struct vfs_context context;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return ENODEV;
	}

	context.vc_thread = current_thread();
	context.vc_ucred = FSCRED;

	/* describe the new vnode: a block-special node with no owning mount */
	vfsp.vnfs_mp = (struct mount *)0;
	vfsp.vnfs_vtype = VBLK;
	vfsp.vnfs_str = "bdevvp";
	vfsp.vnfs_dvp = NULL;
	vfsp.vnfs_fsnode = NULL;
	vfsp.vnfs_cnp = NULL;
	vfsp.vnfs_vops = spec_vnodeop_p;
	vfsp.vnfs_rdev = dev;
	vfsp.vnfs_filesize = 0;

	vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;

	vfsp.vnfs_marksystem = 0;
	vfsp.vnfs_markroot = 0;

	if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
		*vpp = NULLVP;
		return error;
	}
	vnode_lock_spin(nvp);
	nvp->v_flag |= VBDEVVP;
	nvp->v_tag = VT_NON;    /* set this to VT_NON so during aliasing it can be replaced */
	vnode_unlock(nvp);
	/* these failures panic: the system cannot run without this vnode */
	if ((error = vnode_ref(nvp))) {
		panic("bdevvp failed: vnode_ref");
		return error;
	}
	if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
		panic("bdevvp failed: fsync");
		return error;
	}
	/* flush any stale buffers left from a previous incarnation of this device */
	if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
		panic("bdevvp failed: invalidateblks");
		return error;
	}

#if CONFIG_MACF
	/*
	 * XXXMAC: We can't put a MAC check here, the system will
	 * panic without this vnode.
	 */
#endif /* MAC */

	if ((error = VNOP_OPEN(nvp, FREAD, &context))) {
		panic("bdevvp failed: open");
		return error;
	}
	*vpp = nvp;

	return 0;
}
2172 
2173 /*
2174  * Check to see if the new vnode represents a special device
2175  * for which we already have a vnode (either because of
2176  * bdevvp() or because of a different vnode representing
2177  * the same block device). If such an alias exists, deallocate
2178  * the existing contents and return the aliased vnode. The
2179  * caller is responsible for filling it with its new contents.
2180  */
static vnode_t
checkalias(struct vnode *nvp, dev_t nvp_rdev)
{
	struct vnode *vp;
	struct vnode **vpp;
	struct specinfo *sin = NULL;
	int vid = 0;

	/* hash chain for this device number */
	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	SPECHASH_LOCK();

	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
			vid = vp->v_id;
			vnode_hold(vp);
			break;
		}
	}
	SPECHASH_UNLOCK();

	if (vp) {
found_alias:
		/* vid mismatch means the vnode was recycled after we unlocked */
		if (vnode_getwithvid(vp, vid)) {
			vnode_drop(vp);
			goto loop;
		}
		vnode_drop(vp);
		/*
		 * Termination state is checked in vnode_getwithvid
		 */
		vnode_lock(vp);

		/*
		 * Alias, but not in use, so flush it out.
		 */
		if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
			vnode_hold(vp);
			vnode_reclaim_internal(vp, 1, 1, 0);
			vnode_put_locked(vp);
			vnode_drop_and_unlock(vp);
			goto loop;
		}
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
		if (sin == NULL) {
			sin = zalloc_flags(specinfo_zone, Z_WAITOK | Z_ZERO);
		} else {
			/* reuse the specinfo allocated on a previous pass of the loop */
			bzero(sin, sizeof(struct specinfo));
		}

		nvp->v_specinfo = sin;
		nvp->v_rdev = nvp_rdev;
		nvp->v_specflags = 0;
		nvp->v_speclastr = -1;
		nvp->v_specinfo->si_opencount = 0;
		nvp->v_specinfo->si_initted = 0;
		nvp->v_specinfo->si_throttleable = 0;
		nvp->v_specinfo->si_devbsdunit = LOWPRI_MAX_NUM_DEV;

		SPECHASH_LOCK();

		/* We dropped the lock, someone could have added */
		if (vp == NULLVP) {
			for (vp = *vpp; vp; vp = vp->v_specnext) {
				if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
					vid = vp->v_id;
					vnode_hold(vp);
					SPECHASH_UNLOCK();
					goto found_alias;
				}
			}
		}

		/* link nvp onto the front of its hash chain */
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		*vpp = nvp;

		if (vp != NULLVP) {
			/* both vnodes now name the same device */
			nvp->v_specflags |= SI_ALIASED;
			vp->v_specflags |= SI_ALIASED;
			SPECHASH_UNLOCK();
			vnode_put_locked(vp);
			vnode_unlock(vp);
		} else {
			SPECHASH_UNLOCK();
		}

		return NULLVP;
	}

	if (sin) {
		/* the speculatively allocated specinfo was not needed */
		zfree(specinfo_zone, sin);
	}

	if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) {
		return vp;
	}

	panic("checkalias with VT_NON vp that shouldn't: %p", vp);

	return vp;
}
2284 
2285 
2286 /*
2287  * Get a reference on a particular vnode and lock it if requested.
2288  * If the vnode was on the inactive list, remove it from the list.
2289  * If the vnode was on the free list, remove it from the list and
2290  * move it to inactive list as needed.
2291  * The vnode lock bit is set if the vnode is being eliminated in
2292  * vgone. The process is awakened when the transition is completed,
2293  * and an error returned to indicate that the vnode is no longer
2294  * usable (possibly having been changed to a new file system type).
2295  */
2296 int
vget_internal(vnode_t vp,int vid,int vflags)2297 vget_internal(vnode_t vp, int vid, int vflags)
2298 {
2299 	int error = 0;
2300 
2301 	vnode_lock_spin(vp);
2302 
2303 	if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) {
2304 		/*
2305 		 * vnode to be returned only if it has writers opened
2306 		 */
2307 		error = EINVAL;
2308 	} else {
2309 		error = vnode_getiocount(vp, vid, vflags);
2310 	}
2311 
2312 	vnode_unlock(vp);
2313 
2314 	return error;
2315 }
2316 
2317 /*
2318  * Returns:	0			Success
2319  *		ENOENT			No such file or directory [terminating]
2320  */
int
vnode_ref(vnode_t vp)
{
	/* plain usecount reference: no FWRITE/O_EVTONLY accounting, no flags */
	return vnode_ref_ext(vp, 0, 0);
}
2326 
2327 /*
2328  * Returns:	0			Success
2329  *		ENOENT			No such file or directory [terminating]
2330  */
int
vnode_ref_ext(vnode_t vp, int fmode, int flags)
{
	int     error = 0;

	vnode_lock_spin(vp);

	/*
	 * once all the current call sites have been fixed to insure they have
	 * taken an iocount, we can toughen this assert up and insist that the
	 * iocount is non-zero... a non-zero usecount doesn't insure correctness
	 */
	if (vp->v_iocount <= 0 && vp->v_usecount <= 0) {
		panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
	}

	/*
	 * if you are the owner of drain/termination, can acquire usecount
	 */
	if ((flags & VNODE_REF_FORCE) == 0) {
		if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
			if (vp->v_owner != current_thread()) {
				error = ENOENT;
				goto out;
			}
		}
	}

	/* Enable atomic ops on v_usecount without the vnode lock */
	os_atomic_inc(&vp->v_usecount, relaxed);

	/* track writers and event-only opens in their own counters */
	if (fmode & FWRITE) {
		if (++vp->v_writecount <= 0) {
			panic("vnode_ref_ext: v_writecount");
		}
	}
	if (fmode & O_EVTONLY) {
		if (++vp->v_kusecount <= 0) {
			panic("vnode_ref_ext: v_kusecount");
		}
	}
	if (vp->v_flag & VRAGE) {
		struct  uthread *ut;

		ut = current_uthread();

		if (!(current_proc()->p_lflag & P_LRAGE_VNODES) &&
		    !(ut->uu_flag & UT_RAGE_VNODES)) {
			/*
			 * a 'normal' process accessed this vnode
			 * so make sure its no longer marked
			 * for rapid aging...  also, make sure
			 * it gets removed from the rage list...
			 * when v_usecount drops back to 0, it
			 * will be put back on the real free list
			 */
			vp->v_flag &= ~VRAGE;
			vp->v_references = 0;
			vnode_list_remove(vp);
		}
	}
	if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
		/* first usecount on a regular file: tell UBC its pager is in use */
		if (vp->v_ubcinfo) {
			vnode_lock_convert(vp);
			memory_object_mark_used(vp->v_ubcinfo->ui_control);
		}
	}
out:
	vnode_unlock(vp);

	return error;
}
2403 
2404 
2405 boolean_t
vnode_on_reliable_media(vnode_t vp)2406 vnode_on_reliable_media(vnode_t vp)
2407 {
2408 	mount_t mp = vp->v_mount;
2409 
2410 	/*
2411 	 * A NULL mountpoint would imply it's not attached to a any filesystem.
2412 	 * This can only happen with a vnode created by bdevvp(). We'll consider
2413 	 * those as not unreliable as the primary use of this function is determine
2414 	 * which vnodes are to be handed off to the async cleaner thread for
2415 	 * reclaim.
2416 	 */
2417 	if (!mp || (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV) && (mp->mnt_flag & MNT_LOCAL))) {
2418 		return TRUE;
2419 	}
2420 
2421 	return FALSE;
2422 }
2423 
/* Put vp on the async work queue; caller holds the vnode_list_lock. */
static void
vnode_async_list_add_locked(vnode_t vp)
{
	if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
		panic("vnode_async_list_add: %p is in wrong state", vp);
	}

	TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist);
	vp->v_listflag |= VLIST_ASYNC_WORK;

	async_work_vnodes++;
	if (!(vp->v_listflag & VLIST_NO_REUSE)) {
		reusablevnodes++;
	}
	if (vp->v_flag & VCANDEALLOC) {
		/* now on a list, so no longer counted as busy */
		os_atomic_dec(&busyvnodes, relaxed);
	}
}
2442 
2443 static void
vnode_async_list_add(vnode_t vp)2444 vnode_async_list_add(vnode_t vp)
2445 {
2446 	vnode_list_lock();
2447 
2448 	if (VONLIST(vp)) {
2449 		if (!(vp->v_listflag & VLIST_ASYNC_WORK)) {
2450 			vnode_list_remove_locked(vp);
2451 			vnode_async_list_add_locked(vp);
2452 		}
2453 	} else {
2454 		vnode_async_list_add_locked(vp);
2455 	}
2456 
2457 	vnode_list_unlock();
2458 
2459 	wakeup(&vnode_async_work_list);
2460 }
2461 
2462 
2463 /*
2464  * put the vnode on appropriate free list.
2465  * called with vnode LOCKED
2466  */
static void
vnode_list_add(vnode_t vp)
{
	boolean_t need_dead_wakeup = FALSE;
	bool no_busy_decrement = false;

#if DIAGNOSTIC
	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
#endif

again:

	/*
	 * if it is already on a list or non zero references return
	 */
	if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) {
		return;
	}

	/*
	 * In vclean, we might have deferred ditching locked buffers
	 * because something was still referencing them (indicated by
	 * usecount).  We can ditch them now.
	 */
	if (ISSET(vp->v_lflag, VL_DEAD)
	    && (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))) {
		++vp->v_iocount;        // Probably not necessary, but harmless
#ifdef CONFIG_IOCOUNT_TRACE
		record_vp(vp, 1);
#endif
		vnode_unlock(vp);
		buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, 0, 0);
		vnode_lock(vp);
		vnode_dropiocount(vp);
		/* we dropped the vnode lock above; re-evaluate state from the top */
		goto again;
	}

	vnode_list_lock();

	if (!(vp->v_lflag & VL_DEAD) && (vp->v_listflag & VLIST_NO_REUSE)) {
		/* not reusable: park on the async work queue instead of a free list */
		if (!(vp->v_listflag & VLIST_ASYNC_WORK)) {
			vnode_async_list_add_locked(vp);
		}
		no_busy_decrement = true;
	} else if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
		/*
		 * add the new guy to the appropriate end of the RAGE list
		 */
		if ((vp->v_flag & VAGE)) {
			TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
		}

		vp->v_listflag |= VLIST_RAGE;
		ragevnodes++;
		reusablevnodes++;
		wakeup_laundry_thread();

		/*
		 * reset the timestamp for the last inserted vp on the RAGE
		 * queue to let new_vnode know that its not ok to start stealing
		 * from this list... as long as we're actively adding to this list
		 * we'll push out the vnodes we want to donate to the real free list
		 * once we stop pushing, we'll let some time elapse before we start
		 * stealing them in the new_vnode routine
		 */
		microuptime(&rage_tv);
	} else {
		/*
		 * if VL_DEAD, insert it at head of the dead list
		 * else insert at tail of LRU list or at head if VAGE is set
		 */
		if ((vp->v_lflag & VL_DEAD)) {
			if (vp->v_flag & VCANDEALLOC) {
				TAILQ_INSERT_TAIL(&vnode_dead_list, vp, v_freelist);
				if (vp->v_listflag & VLIST_NO_REUSE) {
					deadvnodes_noreuse++;
				}
			} else {
				TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
			}
			vp->v_listflag |= VLIST_DEAD;
			deadvnodes++;

			if (dead_vnode_wanted) {
				/* a thread is blocked waiting for a dead vnode to reuse */
				dead_vnode_wanted--;
				need_dead_wakeup = TRUE;
			}
		} else if ((vp->v_flag & VAGE)) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
			vp->v_flag &= ~VAGE;
			freevnodes++;
			reusablevnodes++;
			wakeup_laundry_thread();
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			freevnodes++;
			reusablevnodes++;
			wakeup_laundry_thread();
		}
	}
	if ((vp->v_flag & VCANDEALLOC) && !no_busy_decrement) {
		os_atomic_dec(&busyvnodes, relaxed);
	}
	vnode_list_unlock();

	/* wakeup is issued after the list lock is dropped */
	if (need_dead_wakeup == TRUE) {
		wakeup_one((caddr_t)&dead_vnode_wanted);
	}
}
2578 
2579 
2580 /*
2581  * remove the vnode from appropriate free list.
2582  * called with vnode LOCKED and
2583  * the list lock held
2584  */
static void
vnode_list_remove_locked(vnode_t vp)
{
	if (VONLIST(vp)) {
		/*
		 * the v_listflag field is
		 * protected by the vnode_list_lock
		 */
		if (vp->v_listflag & VLIST_RAGE) {
			VREMRAGE("vnode_list_remove", vp);
		} else if (vp->v_listflag & VLIST_DEAD) {
			VREMDEAD("vnode_list_remove", vp);
			/* dead-list occupancy changed; poke the laundry thread */
			wakeup_laundry_thread();
		} else if (vp->v_listflag & VLIST_ASYNC_WORK) {
			VREMASYNC_WORK("vnode_list_remove", vp);
		} else {
			VREMFREE("vnode_list_remove", vp);
		}
		if (vp->v_flag & VCANDEALLOC) {
			/* off all free lists now, so it counts as busy */
			os_atomic_inc(&busyvnodes, relaxed);
		}
	}
}
2608 
2609 
2610 /*
2611  * remove the vnode from appropriate free list.
2612  * called with vnode LOCKED
2613  */
2614 static void
vnode_list_remove(vnode_t vp)2615 vnode_list_remove(vnode_t vp)
2616 {
2617 #if DIAGNOSTIC
2618 	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
2619 #endif
2620 	/*
2621 	 * we want to avoid taking the list lock
2622 	 * in the case where we're not on the free
2623 	 * list... this will be true for most
2624 	 * directories and any currently in use files
2625 	 *
2626 	 * we're guaranteed that we can't go from
2627 	 * the not-on-list state to the on-list
2628 	 * state since we hold the vnode lock...
2629 	 * all calls to vnode_list_add are done
2630 	 * under the vnode lock... so we can
2631 	 * check for that condition (the prevelant one)
2632 	 * without taking the list lock
2633 	 */
2634 	if (VONLIST(vp)) {
2635 		vnode_list_lock();
2636 		/*
2637 		 * however, we're not guaranteed that
2638 		 * we won't go from the on-list state
2639 		 * to the not-on-list state until we
2640 		 * hold the vnode_list_lock... this
2641 		 * is due to "new_vnode" removing vnodes
2642 		 * from the free list uder the list_lock
2643 		 * w/o the vnode lock... so we need to
2644 		 * check again whether we're currently
2645 		 * on the free list
2646 		 */
2647 		vnode_list_remove_locked(vp);
2648 
2649 		vnode_list_unlock();
2650 	}
2651 }
2652 
2653 
void
vnode_rele(vnode_t vp)
{
	/* drop a plain usecount: no open-mode flags, reenter allowed, lock not held */
	vnode_rele_internal(vp, 0, 0, 0);
}
2659 
2660 
void
vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
{
	/* like vnode_rele(), but also drops FWRITE/O_EVTONLY counts per fmode */
	vnode_rele_internal(vp, fmode, dont_reenter, 0);
}
2666 
2667 
/*
 * Core of vnode_rele/vnode_rele_ext: drop a usecount (and any fmode
 * counts), and when the last reference goes away either queue the vnode
 * on a free list or run VNOP_INACTIVE (and possibly reclaim).  'locked'
 * indicates the caller already holds the vnode lock.
 */
void
vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
{
	int32_t old_usecount;

	if (!locked) {
		vnode_hold(vp);
		vnode_lock_spin(vp);
	}
#if DIAGNOSTIC
	else {
		lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
	}
#endif
	/* Enable atomic ops on v_usecount without the vnode lock */
	old_usecount = os_atomic_dec_orig(&vp->v_usecount, relaxed);
	if (old_usecount < 1) {
		/*
		 * Because we allow atomic ops on usecount (in lookup only, under
		 * specific conditions of already having a usecount) it is
		 * possible that when the vnode is examined, its usecount is
		 * different than what will be printed in this panic message.
		 */
		panic("vnode_rele_ext: vp %p usecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.",
		    vp, old_usecount - 1, vp->v_tag, vp->v_type, vp->v_flag);
	}

	if (fmode & FWRITE) {
		if (--vp->v_writecount < 0) {
			panic("vnode_rele_ext: vp %p writecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag);
		}
	}
	if (fmode & O_EVTONLY) {
		if (--vp->v_kusecount < 0) {
			panic("vnode_rele_ext: vp %p kusecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag);
		}
	}
	if (vp->v_kusecount > vp->v_usecount) {
		panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d).  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
	}

	if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
		/*
		 * vnode is still busy... if we're the last
		 * usecount, mark for a future call to VNOP_INACTIVE
		 * when the iocount finally drops to 0
		 */
		if (vp->v_usecount == 0) {
			vp->v_lflag |= VL_NEEDINACTIVE;
			vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
		}
		goto done;
	}
	vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);

	if (ISSET(vp->v_lflag, VL_TERMINATE | VL_DEAD) || dont_reenter) {
		/*
		 * vnode is being cleaned, or
		 * we've requested that we don't reenter
		 * the filesystem on this release...in
		 * the latter case, we'll mark the vnode aged
		 */
		if (dont_reenter) {
			if (!(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM))) {
				vp->v_lflag |= VL_NEEDINACTIVE;

				/* unreliable media or dirty: hand off to the async cleaner */
				if (vnode_on_reliable_media(vp) == FALSE || vp->v_flag & VISDIRTY) {
					vnode_async_list_add(vp);
					goto done;
				}
			}
			vp->v_flag |= VAGE;
		}
		vnode_list_add(vp);

		goto done;
	}
	/*
	 * at this point both the iocount and usecount
	 * are zero
	 * pick up an iocount so that we can call
	 * VNOP_INACTIVE with the vnode lock unheld
	 */
	vp->v_iocount++;
#ifdef CONFIG_IOCOUNT_TRACE
	record_vp(vp, 1);
#endif
	vp->v_lflag &= ~VL_NEEDINACTIVE;

	if (UBCINFOEXISTS(vp)) {
		ubc_cs_free_and_vnode_unlock(vp);
	} else {
		vnode_unlock(vp);
	}

	VNOP_INACTIVE(vp, vfs_context_current());

	vnode_lock_spin(vp);

	/*
	 * because we dropped the vnode lock to call VNOP_INACTIVE
	 * the state of the vnode may have changed... we may have
	 * picked up an iocount, usecount or the MARKTERM may have
	 * been set... we need to reevaluate the reference counts
	 * to determine if we can call vnode_reclaim_internal at
	 * this point... if the reference counts are up, we'll pick
	 * up the MARKTERM state when they get subsequently dropped
	 */
	if ((vp->v_iocount == 1) && (vp->v_usecount == 0) &&
	    ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
		struct  uthread *ut;

		ut = current_uthread();

		if (ut->uu_defer_reclaims) {
			/* queue on this thread's deferred-reclaim list instead */
			vp->v_defer_reclaimlist = ut->uu_vreclaims;
			ut->uu_vreclaims = vp;
			goto done;
		}
		vnode_lock_convert(vp);
		vnode_reclaim_internal(vp, 1, 1, 0);
	}
	vnode_dropiocount(vp);
	vnode_list_add(vp);
done:
	if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
		/* last usecount on a regular file: tell UBC its pager is idle */
		if (vp->v_ubcinfo) {
			vnode_lock_convert(vp);
			memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
		}
	}
	if (!locked) {
		vnode_drop_and_unlock(vp);
	}
	return;
}
2804 
2805 /*
2806  * Remove any vnodes in the vnode table belonging to mount point mp.
2807  *
2808  * If MNT_NOFORCE is specified, there should not be any active ones,
2809  * return error if any are found (nb: this is a user error, not a
2810  * system error). If MNT_FORCE is specified, detach any active vnodes
2811  * that are found.
2812  */
2813 
int
vflush(struct mount *mp, struct vnode *skipvp, int flags)
{
	struct vnode *vp;
	int busy = 0;
	int reclaimed = 0;
	int retval;
	unsigned int vid;
	bool first_try = true;

	/*
	 * See comments in vnode_iterate() for the rationale for this lock
	 */
	mount_iterate_lock(mp);

	mount_lock(mp);
	vnode_iterate_setup(mp);
	/*
	 * On regular unmounts(not forced) do a
	 * quick check for vnodes to be in use. This
	 * preserves the caching of vnodes. automounter
	 * tries unmounting every so often to see whether
	 * it is still busy or not.
	 */
	if (((flags & FORCECLOSE) == 0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
		if (vnode_umount_preflight(mp, skipvp, flags)) {
			vnode_iterate_clear(mp);
			mount_unlock(mp);
			mount_iterate_unlock(mp);
			return EBUSY;
		}
	}
loop:
	/* If it returns 0 then there is nothing to do */
	retval = vnode_iterate_prepare(mp);

	if (retval == 0) {
		vnode_iterate_clear(mp);
		mount_unlock(mp);
		mount_iterate_unlock(mp);
		return retval;
	}

	/* iterate over all the vnodes */
	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		/* pull from the worker queue, re-insert on the main vnode list */
		vp = TAILQ_FIRST(&mp->mnt_workerqueue);
		TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);

		if ((vp->v_mount != mp) || (vp == skipvp)) {
			continue;
		}
		vid = vp->v_id;
		mount_unlock(mp);

		vnode_lock_spin(vp);

		// If vnode is already terminating, wait for it...
		while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) {
			vp->v_lflag |= VL_TERMWANT;
			msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
		}

		if ((vp->v_id != vid) || ISSET(vp->v_lflag, VL_DEAD)) {
			/* vnode was recycled or already cleaned while we slept */
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}

		/*
		 * If requested, skip over vnodes marked VSYSTEM.
		 * Skip over all vnodes marked VNOFLUSH.
		 */
		if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
		    (vp->v_flag & VNOFLUSH))) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If requested, skip over vnodes marked VSWAP.
		 */
		if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If requested, skip over vnodes marked VROOT.
		 */
		if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If the real usecount is 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (((vp->v_usecount == 0) ||
		    ((vp->v_usecount - vp->v_kusecount) == 0))) {
			vnode_lock_convert(vp);
			vnode_hold(vp);
			vp->v_iocount++;        /* so that drain waits for * other iocounts */
#ifdef CONFIG_IOCOUNT_TRACE
			record_vp(vp, 1);
#endif
			vnode_reclaim_internal(vp, 1, 1, 0);
			vnode_dropiocount(vp);
			vnode_list_add(vp);
			vnode_drop_and_unlock(vp);

			reclaimed++;
			mount_lock(mp);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			vnode_lock_convert(vp);

			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vp->v_iocount++;        /* so that drain waits * for other iocounts */
				vnode_hold(vp);
#ifdef CONFIG_IOCOUNT_TRACE
				record_vp(vp, 1);
#endif
				vnode_abort_advlocks(vp);
				vnode_reclaim_internal(vp, 1, 1, 0);
				vnode_dropiocount(vp);
				vnode_list_add(vp);
				vnode_drop_and_unlock(vp);
			} else {
				/* device node: detach from the fs but keep it usable via spec ops */
				vnode_hold(vp);
				vp->v_lflag |= VL_OPSCHANGE;
				vclean(vp, 0);
				vp->v_lflag &= ~VL_DEAD;
				vp->v_op = spec_vnodeop_p;
				vp->v_flag |= VDEVFLUSH;
				vnode_drop_and_unlock(vp);
				wakeup(&vp->v_lflag); /* chkvnlock is waitng for VL_DEAD to get unset */
			}
			mount_lock(mp);
			continue;
		}

		/* log vnodes blocking unforced unmounts */
		if (print_busy_vnodes && first_try && ((flags & FORCECLOSE) == 0)) {
			vprint("vflush - busy vnode", vp);
		}

		vnode_unlock(vp);
		mount_lock(mp);
		busy++;
	}

	/* At this point the worker queue is completed */
	if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) {
		/* reclaiming some vnodes may have unblocked others; sweep again */
		busy = 0;
		reclaimed = 0;
		(void)vnode_iterate_reloadq(mp);
		first_try = false;
		/* returned with mount lock held */
		goto loop;
	}

	/* if new vnodes were created in between retry the reclaim */
	if (vnode_iterate_reloadq(mp) != 0) {
		if (!(busy && ((flags & FORCECLOSE) == 0))) {
			first_try = false;
			goto loop;
		}
	}
	vnode_iterate_clear(mp);
	mount_unlock(mp);
	mount_iterate_unlock(mp);

	if (busy && ((flags & FORCECLOSE) == 0)) {
		return EBUSY;
	}
	return 0;
}
3009 
long num_recycledvnodes = 0;    /* bumped atomically each time vclean() runs */
3011 /*
3012  * Disassociate the underlying file system from a vnode.
3013  * The vnode lock is held on entry.
3014  */
static void
vclean(vnode_t vp, int flags)
{
	vfs_context_t ctx = vfs_context_current();
	int active;
	int need_inactive;
	int already_terminating;
	int clflags = 0;
#if NAMEDSTREAMS
	int is_namedstream;
#endif

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */
	active = vp->v_usecount;

	/*
	 * just in case we missed sending a needed
	 * VNOP_INACTIVE, we'll do it now
	 */
	need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);

	vp->v_lflag &= ~VL_NEEDINACTIVE;

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	already_terminating = (vp->v_lflag & VL_TERMINATE);

	vp->v_lflag |= VL_TERMINATE;

#if NAMEDSTREAMS
	is_namedstream = vnode_isnamedstream(vp);
#endif

	/* drop the vnode lock across the calls back into the filesystem */
	vnode_unlock(vp);

	OSAddAtomicLong(1, &num_recycledvnodes);

	/* translate caller's flags into VNOP_CLOSE() io flags */
	if (flags & DOCLOSE) {
		clflags |= IO_NDELAY;
	}
	if (flags & REVOKEALL) {
		clflags |= IO_REVOKE;
	}

#if CONFIG_MACF
	if (vp->v_mount) {
		/*
		 * It is possible for bdevvp vnodes to not have a mount
		 * pointer. It's fine to let it get reclaimed without
		 * notifying.
		 */
		mac_vnode_notify_reclaim(vp);
	}
#endif

	if (active && (flags & DOCLOSE)) {
		VNOP_CLOSE(vp, clflags, ctx);
	}

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE) {
		if (vp->v_tag == VT_NFS) {
			/* NFS manages its own buffer invalidation */
			nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
		} else {
			VNOP_FSYNC(vp, MNT_WAIT, ctx);

			/*
			 * If the vnode is still in use (by the journal for
			 * example) we don't want to invalidate locked buffers
			 * here.  In that case, either the journal will tidy them
			 * up, or we will deal with it when the usecount is
			 * finally released in vnode_rele_internal.
			 */
			buf_invalidateblks(vp, BUF_WRITE_DATA | (active ? 0 : BUF_INVALIDATE_LOCKED), 0, 0);
		}
		if (UBCINFOEXISTS(vp)) {
			/*
			 * Clean the pages in VM.
			 */
			(void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
		}
	}
	/* deliver any VNOP_INACTIVE the vnode still owes the filesystem */
	if (active || need_inactive) {
		VNOP_INACTIVE(vp, ctx);
	}

#if NAMEDSTREAMS
	if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) {
		vnode_t pvp = vp->v_parent;

		/* Delete the shadow stream file before we reclaim its vnode */
		if (vnode_isshadow(vp)) {
			vnode_relenamedstream(pvp, vp);
		}

		/*
		 * No more streams associated with the parent.  We
		 * have a ref on it, so its identity is stable.
		 * If the parent is on an opaque volume, then we need to know
		 * whether it has associated named streams.
		 */
		if (vfs_authopaque(pvp->v_mount)) {
			vnode_lock_spin(pvp);
			pvp->v_lflag &= ~VL_HASSTREAMS;
			vnode_unlock(pvp);
		}
	}
#endif

	/* classify why the backing VM object is being destroyed */
	vm_object_destroy_reason_t reason = VM_OBJECT_DESTROY_UNKNOWN_REASON;
	bool forced_unmount = vnode_mount(vp) != NULL && (vnode_mount(vp)->mnt_lflag & MNT_LFORCE) != 0;
	bool ungraft_heuristic = flags & REVOKEALL;
	if (forced_unmount) {
		reason = VM_OBJECT_DESTROY_FORCED_UNMOUNT;
	} else if (ungraft_heuristic) {
		reason = VM_OBJECT_DESTROY_UNGRAFT;
	}

	/*
	 * Destroy ubc named reference
	 * cluster_release is done on this path
	 * along with dropping the reference on the ucred
	 * (and in the case of forced unmount of an mmap-ed file,
	 * the ubc reference on the vnode is dropped here too).
	 */
	ubc_destroy_named(vp, reason);

#if CONFIG_TRIGGERS
	/*
	 * cleanup trigger info from vnode (if any)
	 */
	if (vp->v_resolve) {
		vnode_resolver_detach(vp);
	}
#endif

#if CONFIG_IO_COMPRESSION_STATS
	if ((vp->io_compression_stats)) {
		vnode_iocs_record_and_free(vp);
	}
#endif /* CONFIG_IO_COMPRESSION_STATS */

	/*
	 * Reclaim the vnode.
	 */
	if (VNOP_RECLAIM(vp, ctx)) {
		panic("vclean: cannot reclaim");
	}

	// make sure the name & parent ptrs get cleaned out!
	vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE | VNODE_UPDATE_PURGEFIRMLINK);

	/* re-acquire the vnode lock; it is held again on return */
	vnode_lock(vp);

	/*
	 * Remove the vnode from any mount list it might be on.  It is not
	 * safe to do this any earlier because unmount needs to wait for
	 * any vnodes to terminate and it cannot do that if it cannot find
	 * them.
	 */
	insmntque(vp, (struct mount *)0);

	/* from here on the vnode is "dead": route all ops to dead_vnodeops */
	vp->v_lflag |= VL_DEAD;
	vp->v_mount = dead_mountp;
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_data = NULL;

	vp->v_flag &= ~VISDIRTY;

	if (already_terminating == 0) {
		vp->v_lflag &= ~VL_TERMINATE;
		/*
		 * Done with purge, notify sleepers of the grim news.
		 */
		if (vp->v_lflag & VL_TERMWANT) {
			vp->v_lflag &= ~VL_TERMWANT;
			wakeup(&vp->v_lflag);
		}
	}
}
3205 
3206 /*
3207  * Eliminate all activity associated with  the requested vnode
3208  * and with all vnodes aliased to the requested vnode.
3209  */
int
#if DIAGNOSTIC
vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
#else
vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
#endif
{
	struct vnode *vq;
	int vid;

#if DIAGNOSTIC
	if ((flags & REVOKEALL) == 0) {
		panic("vnop_revoke");
	}
#endif

	if (vnode_isaliased(vp)) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * return an immediate error
		 */
		if (vp->v_lflag & VL_TERMINATE) {
			return ENOENT;
		}

		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		SPECHASH_LOCK();
		while ((vp->v_specflags & SI_ALIASED)) {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				/* only true aliases: same dev and type, but not vp itself */
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq) {
					continue;
				}
				vid = vq->v_id;
				/* hold 'vq' so it can't be freed while the hash lock is dropped */
				vnode_hold(vq);
				SPECHASH_UNLOCK();
				if (vnode_getwithvid(vq, vid)) {
					/* vnode was recycled under us; rescan the chain */
					vq = vnode_drop(vq);
					SPECHASH_LOCK();
					break;
				}
				vnode_lock(vq);
				if (!(vq->v_lflag & VL_TERMINATE)) {
					vnode_reclaim_internal(vq, 1, 1, 0);
				}
				vnode_put_locked(vq);
				vq = vnode_drop_and_unlock(vq);
				SPECHASH_LOCK();
				break;
			}
		}
		SPECHASH_UNLOCK();
	}
	/* finally revoke 'vp' itself, unless a reclaim beat us to it */
	vnode_lock(vp);
	if (vp->v_lflag & VL_TERMINATE) {
		vnode_unlock(vp);
		return ENOENT;
	}
	vnode_reclaim_internal(vp, 1, 0, REVOKEALL);
	vnode_unlock(vp);

	return 0;
}
3276 
3277 /*
3278  * Recycle an unused vnode to the front of the free list.
 * If the vnode is still referenced, it is instead marked
 * (VL_MARKTERM) to be reclaimed when its counts drain.
3280  */
3281 int
vnode_recycle(struct vnode * vp)3282 vnode_recycle(struct vnode *vp)
3283 {
3284 	vnode_lock_spin(vp);
3285 
3286 	if (vp->v_iocount || vp->v_usecount) {
3287 		vp->v_lflag |= VL_MARKTERM;
3288 		vnode_unlock(vp);
3289 		return 0;
3290 	}
3291 	vnode_lock_convert(vp);
3292 	vnode_hold(vp);
3293 	vnode_reclaim_internal(vp, 1, 0, 0);
3294 
3295 	vnode_drop_and_unlock(vp);
3296 
3297 	return 1;
3298 }
3299 
/*
 * Arrange for a vnode (on which the caller holds the sole iocount) to
 * be reclaimed once that iocount is dropped.  Returns 1 if the vnode
 * was marked, 0 if it is still in use by someone else.
 */
static int
vnode_reload(vnode_t vp)
{
	vnode_lock_spin(vp);

	if ((vp->v_iocount > 1) || vp->v_usecount) {
		/* referenced beyond the caller's own iocount: leave it alone */
		vnode_unlock(vp);
		return 0;
	}
	if (vp->v_iocount <= 0) {
		panic("vnode_reload with no iocount %d", vp->v_iocount);
	}

	/* mark for release when iocount is dropped */
	vp->v_lflag |= VL_MARKTERM;
	vnode_unlock(vp);

	return 1;
}
3319 
3320 
/*
 * Kill off a vnode: clean it, and for special devices unlink it from
 * the device alias hash chain and free its specinfo.
 */
static void
vgone(vnode_t vp, int flags)
{
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * Clean out the filesystem specific data.
	 * vclean also takes care of removing the
	 * vnode from any mount list it might be on
	 */
	vclean(vp, flags | DOCLOSE);

	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		SPECHASH_LOCK();
		/* unlink vp from its device hash chain */
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp) {
					continue;
				}
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL) {
				panic("missing bdev");
			}
		}
		if (vp->v_specflags & SI_ALIASED) {
			/*
			 * vp was one of several vnodes for the same device.
			 * If exactly one alias remains (vx found, no second
			 * match), it is no longer aliased.
			 */
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type) {
					continue;
				}
				if (vx) {
					/* at least two aliases remain */
					break;
				}
				vx = vq;
			}
			if (vx == NULL) {
				panic("missing alias");
			}
			if (vq == NULL) {
				vx->v_specflags &= ~SI_ALIASED;
			}
			vp->v_specflags &= ~SI_ALIASED;
		}
		SPECHASH_UNLOCK();
		{
			/* free the specinfo only after dropping the hash lock */
			struct specinfo *tmp = vp->v_specinfo;
			vp->v_specinfo = NULL;
			zfree(specinfo_zone, tmp);
		}
	}
}
3382 
3383 /*
 * Check whether the device identified by (dev, type) has a busy vnode
 * (e.g. one that is mounted on).
3385  */
/*
 * Returns 1 (and sets *errorp via vfs_mountedon()) when a busy vnode
 * exists for (dev, type); 0 otherwise.
 */
int
check_mountedon(dev_t dev, enum vtype type, int  *errorp)
{
	vnode_t vp;
	int rc = 0;
	int vid;

loop:
	SPECHASH_LOCK();
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type) {
			continue;
		}
		vid = vp->v_id;
		/* hold vp so it survives dropping the hash lock */
		vnode_hold(vp);
		SPECHASH_UNLOCK();
		if (vnode_getwithvid(vp, vid)) {
			/* vnode was recycled in the window; start over */
			vnode_drop(vp);
			goto loop;
		}
		vnode_drop(vp);
		vnode_lock_spin(vp);
		/*
		 * Busy check: our own vnode_getwithvid accounts for one
		 * iocount, so anything beyond that means other users.
		 */
		if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
			vnode_unlock(vp);
			if ((*errorp = vfs_mountedon(vp)) != 0) {
				rc = 1;
			}
		} else {
			vnode_unlock(vp);
		}
		vnode_put(vp);
		return rc;
	}
	SPECHASH_UNLOCK();
	return 0;
}
3422 
3423 /*
3424  * Calculate the total number of references to a special device.
3425  */
int
vcount(vnode_t vp)
{
	vnode_t vq, vnext;
	int count;
	int vid;

	/* non-device vnodes: report user (non-kernel) use references */
	if (!vnode_isspec(vp)) {
		return vp->v_usecount - vp->v_kusecount;
	}

loop:
	/* unaliased device: its own open count is the answer */
	if (!vnode_isaliased(vp)) {
		return vp->v_specinfo->si_opencount;
	}
	count = 0;

	SPECHASH_LOCK();
	/*
	 * Grab first vnode and its vid.
	 */
	vq = *vp->v_hashchain;
	if (vq) {
		vid = vq->v_id;
		vnode_hold(vq);
	} else {
		vid = 0;
	}
	SPECHASH_UNLOCK();

	while (vq) {
		/*
		 * Attempt to get the vnode outside the SPECHASH lock.
		 * Don't take iocount on 'vp' as iocount is already held by the caller.
		 */
		if ((vq != vp) && vnode_getwithvid(vq, vid)) {
			/* identity changed: restart the whole chain walk */
			vnode_drop(vq);
			goto loop;
		}
		vnode_drop(vq);
		vnode_lock(vq);

		if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
			if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
				/*
				 * Alias, but not in use, so flush it out.
				 */
				vnode_hold(vq);
				vnode_reclaim_internal(vq, 1, 1, 0);
				vnode_put_locked(vq);
				vnode_drop_and_unlock(vq);
				goto loop;
			}
			/* tally the opens held through this alias */
			count += vq->v_specinfo->si_opencount;
		}
		vnode_unlock(vq);

		SPECHASH_LOCK();
		/*
		 * must do this with the reference still held on 'vq'
		 * so that it can't be destroyed while we're poking
		 * through v_specnext
		 */
		vnext = vq->v_specnext;
		if (vnext) {
			vid = vnext->v_id;
			vnode_hold(vnext);
		} else {
			vid = 0;
		}
		SPECHASH_UNLOCK();

		if (vq != vp) {
			vnode_put(vq);
		}

		vq = vnext;
	}

	return count;
}
3507 
3508 int     prtactive = 0;          /* 1 => print out reclaim of active vnodes */
3509 
3510 /*
3511  * Print out a description of a vnode.
3512  */
3513 static const char *typename[] =
3514 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
3515 
3516 void
vprint(const char * label,struct vnode * vp)3517 vprint(const char *label, struct vnode *vp)
3518 {
3519 	char sbuf[64];
3520 
3521 	if (label != NULL) {
3522 		printf("%s: ", label);
3523 	}
3524 	printf("name %s type %s, usecount %d, writecount %d\n",
3525 	    vp->v_name, typename[vp->v_type],
3526 	    vp->v_usecount, vp->v_writecount);
3527 	sbuf[0] = '\0';
3528 	if (vp->v_flag & VROOT) {
3529 		strlcat(sbuf, "|VROOT", sizeof(sbuf));
3530 	}
3531 	if (vp->v_flag & VTEXT) {
3532 		strlcat(sbuf, "|VTEXT", sizeof(sbuf));
3533 	}
3534 	if (vp->v_flag & VSYSTEM) {
3535 		strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
3536 	}
3537 	if (vp->v_flag & VNOFLUSH) {
3538 		strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
3539 	}
3540 	if (vp->v_flag & VBWAIT) {
3541 		strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
3542 	}
3543 	if (vnode_isaliased(vp)) {
3544 		strlcat(sbuf, "|VALIASED", sizeof(sbuf));
3545 	}
3546 	if (sbuf[0] != '\0') {
3547 		printf("vnode flags (%s\n", &sbuf[1]);
3548 	}
3549 }
3550 
3551 static int
vn_getpath_flags_to_buildpath_flags(int flags)3552 vn_getpath_flags_to_buildpath_flags(int flags)
3553 {
3554 	int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER;
3555 
3556 	if (flags && (flags != VN_GETPATH_FSENTER)) {
3557 		if (flags & VN_GETPATH_NO_FIRMLINK) {
3558 			bpflags |= BUILDPATH_NO_FIRMLINK;
3559 		}
3560 		if (flags & VN_GETPATH_VOLUME_RELATIVE) {
3561 			bpflags |= (BUILDPATH_VOLUME_RELATIVE |
3562 			    BUILDPATH_NO_FIRMLINK);
3563 		}
3564 		if (flags & VN_GETPATH_NO_PROCROOT) {
3565 			bpflags |= BUILDPATH_NO_PROCROOT;
3566 		}
3567 		if (flags & VN_GETPATH_CHECK_MOVED) {
3568 			bpflags |= BUILDPATH_CHECK_MOVED;
3569 		}
3570 	}
3571 
3572 	return bpflags;
3573 }
3574 
3575 int
vn_getpath_ext_with_mntlen(struct vnode * vp,struct vnode * dvp,char * pathbuf,size_t * len,size_t * mntlen,int flags)3576 vn_getpath_ext_with_mntlen(struct vnode *vp, struct vnode *dvp, char *pathbuf,
3577     size_t *len, size_t *mntlen, int flags)
3578 {
3579 	int bpflags = vn_getpath_flags_to_buildpath_flags(flags);
3580 	int local_len;
3581 	int error;
3582 
3583 	if (*len > INT_MAX) {
3584 		return EINVAL;
3585 	}
3586 
3587 	local_len = *len;
3588 
3589 	error = build_path_with_parent(vp, dvp, pathbuf, local_len, &local_len,
3590 	    mntlen, bpflags, vfs_context_current());
3591 
3592 	if (local_len >= 0 && local_len <= (int)*len) {
3593 		*len = (size_t)local_len;
3594 	}
3595 
3596 	return error;
3597 }
3598 
/*
 * Get the path for 'vp'.  '*len' is the size of 'pathbuf' on entry and
 * the resulting length on success.  'dvp', if non-NULL, is a parent
 * directory with an iocount held; 'flags' are VN_GETPATH_* options.
 */
int
vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, size_t *len,
    int flags)
{
	return vn_getpath_ext_with_mntlen(vp, dvp, pathbuf, len, NULL, flags);
}
3605 
3606 /*
3607  * Wrapper around vn_getpath_ext() that takes care of the int * <-> size_t *
3608  * conversion for the legacy KPIs.
3609  */
3610 static int
vn_getpath_ext_int(struct vnode * vp,struct vnode * dvp,char * pathbuf,int * len,int flags)3611 vn_getpath_ext_int(struct vnode *vp, struct vnode *dvp, char *pathbuf,
3612     int *len, int flags)
3613 {
3614 	size_t slen = *len;
3615 	int error;
3616 
3617 	if (*len < 0) {
3618 		return EINVAL;
3619 	}
3620 
3621 	error = vn_getpath_ext(vp, dvp, pathbuf, &slen, flags);
3622 
3623 	if (slen <= INT_MAX) {
3624 		*len = (int)slen;
3625 	}
3626 
3627 	return error;
3628 }
3629 
/*
 * Legacy KPI: int-length variant of vn_getpath_ext() with no flags.
 */
int
vn_getpath(struct vnode *vp, char *pathbuf, int *len)
{
	return vn_getpath_ext_int(vp, NULL, pathbuf, len, 0);
}
3635 
/*
 * Legacy KPI: like vn_getpath(), but permits re-entering the
 * filesystem (VN_GETPATH_FSENTER) to resolve the path.
 */
int
vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len)
{
	return vn_getpath_ext_int(vp, NULL, pathbuf, len, VN_GETPATH_FSENTER);
}
3641 
3642 /*
 * vn_getpath_fsenter_with_parent will reenter the file system to find the path of the
3644  * vnode.  It requires that there are IO counts on both the vnode and the directory vnode.
3645  *
3646  * vn_getpath_fsenter is called by MAC hooks to authorize operations for every thing, but
3647  * unlink, rmdir and rename. For these operation the MAC hook  calls vn_getpath. This presents
3648  * problems where if the path can not be found from the name cache, those operations can
3649  * erroneously fail with EPERM even though the call should succeed. When removing or moving
3650  * file system objects with operations such as unlink or rename, those operations need to
3651  * take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a
 * MAC hook from these operations during forced unmount operations can lead to
 * deadlock. This happens when the operation starts, IO counts are taken on the containing
3654  * directories and targets. Before the MAC hook is called a forced unmount from another
3655  * thread takes place and blocks on the on going operation's directory vnode in vdrain.
3656  * After which, the MAC hook gets called and calls vn_getpath_fsenter.  vn_getpath_fsenter
3657  * is called with the understanding that there is an IO count on the target. If in
3658  * build_path the directory vnode is no longer in the cache, then the parent object id via
 * vnode_getattr from the target is obtained and used to call VFS_VGET to get the parent
3660  * vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get
3661  * an IO count. But VFS_VGET "sees" the directory vnode is in vdrain and can block
3662  * depending on which version and how it calls the vnode_get family of interfaces.
3663  *
3664  * N.B.  A reasonable interface to use is vnode_getwithvid. This interface was modified to
3665  * call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not
3666  * cause issues, but there is no guarantee that all or any file systems are doing that.
3667  *
3668  * vn_getpath_fsenter_with_parent can enter the file system safely since there is a known
3669  * IO count on the directory vnode by calling build_path_with_parent.
3670  */
3671 
/*
 * Re-entering path lookup that is safe during forced unmount because
 * the caller guarantees iocounts on both 'vp' and its directory 'dvp'
 * (see the block comment above for the deadlock scenario this avoids).
 */
int
vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len)
{
	return build_path_with_parent(vp, dvp, pathbuf, *len, len, NULL, 0, vfs_context_current());
}
3677 
/*
 * Legacy KPI: int-length path lookup requesting VN_GETPATH_NO_FIRMLINK
 * treatment from build_path.
 */
int
vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len)
{
	return vn_getpath_ext_int(vp, NULLVP, pathbuf, len,
	           VN_GETPATH_NO_FIRMLINK);
}
3684 
/*
 * Thin wrapper over ubc_cs_getcdhash(): fetch the code-directory hash
 * for the code-signature blob covering 'offset' in 'vp'.
 */
int
vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
{
	return ubc_cs_getcdhash(vp, offset, cdhash);
}
3690 
3691 
3692 static char *extension_table = NULL;
3693 static int   nexts;
3694 static int   max_ext_width;
3695 
/*
 * qsort comparator for the package-extension table: orders entries by
 * ascending string length (equal-length entries keep arbitrary order).
 */
static int
extension_cmp(const void *a, const void *b)
{
	size_t len_a = strlen((const char *)a);
	size_t len_b = strlen((const char *)b);

	return (int)(len_a - len_b);
}
3701 
3702 
3703 //
3704 // This is the api LaunchServices uses to inform the kernel
3705 // the list of package extensions to ignore.
3706 //
// Internally we keep the list sorted by the length of the
// extension (shortest to longest, per extension_cmp).  We sort the
3709 // list of extensions so that we can speed up our searches
3710 // when comparing file names -- we only compare extensions
3711 // that could possibly fit into the file name, not all of
3712 // them (i.e. a short 8 character name can't have an 8
3713 // character extension).
3714 //
3715 extern lck_mtx_t pkg_extensions_lck;
3716 
/*
 * Install a new package-extension table copied in from user space
 * ('nentries' fixed-width rows of 'maxwidth' bytes), replacing and
 * freeing any previous table.
 */
__private_extern__ int
set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
{
	char *new_exts, *old_exts;
	int old_nentries = 0, old_maxwidth = 0;
	int error;

	/* these bounds also keep nentries * maxwidth from overflowing int */
	if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
		return EINVAL;
	}


	// allocate one byte extra so we can guarantee null termination
	new_exts = kalloc_data((nentries * maxwidth) + 1, Z_WAITOK);
	if (new_exts == NULL) {
		return ENOMEM;
	}

	error = copyin(data, new_exts, nentries * maxwidth);
	if (error) {
		kfree_data(new_exts, (nentries * maxwidth) + 1);
		return error;
	}

	new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block

	/* keep the table sorted by extension length (see extension_cmp) */
	qsort(new_exts, nentries, maxwidth, extension_cmp);

	lck_mtx_lock(&pkg_extensions_lck);

	/* swap the new table in under the lock, remembering the old one */
	old_exts        = extension_table;
	old_nentries    = nexts;
	old_maxwidth    = max_ext_width;
	extension_table = new_exts;
	nexts           = nentries;
	max_ext_width   = maxwidth;

	lck_mtx_unlock(&pkg_extensions_lck);

	/* old_exts is NULL on first install; presumably kfree_data tolerates that */
	kfree_data(old_exts, (old_nentries * old_maxwidth) + 1);

	return 0;
}
3760 
3761 
3762 int
is_package_name(const char * name,int len)3763 is_package_name(const char *name, int len)
3764 {
3765 	int i;
3766 	size_t extlen;
3767 	const char *ptr, *name_ext;
3768 
3769 	// if the name is less than 3 bytes it can't be of the
3770 	// form A.B and if it begins with a "." then it is also
3771 	// not a package.
3772 	if (len <= 3 || name[0] == '.') {
3773 		return 0;
3774 	}
3775 
3776 	name_ext = NULL;
3777 	for (ptr = name; *ptr != '\0'; ptr++) {
3778 		if (*ptr == '.') {
3779 			name_ext = ptr;
3780 		}
3781 	}
3782 
3783 	// if there is no "." extension, it can't match
3784 	if (name_ext == NULL) {
3785 		return 0;
3786 	}
3787 
3788 	// advance over the "."
3789 	name_ext++;
3790 
3791 	lck_mtx_lock(&pkg_extensions_lck);
3792 
3793 	// now iterate over all the extensions to see if any match
3794 	ptr = &extension_table[0];
3795 	for (i = 0; i < nexts; i++, ptr += max_ext_width) {
3796 		extlen = strlen(ptr);
3797 		if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
3798 			// aha, a match!
3799 			lck_mtx_unlock(&pkg_extensions_lck);
3800 			return 1;
3801 		}
3802 	}
3803 
3804 	lck_mtx_unlock(&pkg_extensions_lck);
3805 
3806 	// if we get here, no extension matched
3807 	return 0;
3808 }
3809 
/*
 * Walk 'path' (absolute, at most 'pathlen' bytes) component by
 * component and set '*component' to the index of the first component
 * that is a package name, or -1 if none.
 * NOTE: destructive — each examined component's trailing separator is
 * overwritten with '\0' and not restored, so callers must pass a
 * scratch copy of the path.
 */
int
vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
{
	char *ptr, *end;
	int comp = 0;

	if (pathlen < 0) {
		return EINVAL;
	}

	*component = -1;
	if (*path != '/') {
		return EINVAL;
	}

	end = path + 1;
	while (end < path + pathlen && *end != '\0') {
		/* skip any run of '/' separators */
		while (end < path + pathlen && *end == '/' && *end != '\0') {
			end++;
		}

		ptr = end;

		/* advance to the end of this component */
		while (end < path + pathlen && *end != '/' && *end != '\0') {
			end++;
		}

		if (end > path + pathlen) {
			// hmm, string wasn't null terminated
			return EINVAL;
		}

		*end = '\0';
		if (is_package_name(ptr, (int)(end - ptr))) {
			*component = comp;
			break;
		}

		end++;
		comp++;
	}

	return 0;
}
3854 
3855 /*
3856  * Determine if a name is inappropriate for a searchfs query.
3857  * This list consists of /System currently.
3858  */
3859 
int
vn_searchfs_inappropriate_name(const char *name, int len)
{
	/* names (with their lengths) that searchfs must not be asked about */
	static const char *const bad_names[] = { "System" };
	static const int bad_len[] = { 6 };
	int i;

	if (len < 0) {
		return EINVAL;
	}

	for (i = 0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) {
		if (len != bad_len[i]) {
			continue;
		}
		/* compare through the NUL so "Systemx" can't match */
		if (strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) {
			return 1;
		}
	}

	/* no disallowed name matched */
	return 0;
}
3880 
3881 /*
3882  * Top level filesystem related information gathering.
3883  */
3884 extern unsigned int vfs_nummntops;
3885 
3886 /*
3887  * The VFS_NUMMNTOPS shouldn't be at name[1] since
3888  * is a VFS generic variable. Since we no longer support
3889  * VT_UFS, we reserve its value to support this sysctl node.
3890  *
3891  * It should have been:
3892  *    name[0]:  VFS_GENERIC
3893  *    name[1]:  VFS_NUMMNTOPS
3894  */
3895 SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops,
3896     CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
3897     &vfs_nummntops, 0, "");
3898 
/*
 * Legacy VFS_GENERIC sysctl entry point: no longer implemented, every
 * request is rejected with EINVAL.
 */
int
vfs_sysctl(int *name __unused, u_int namelen __unused,
    user_addr_t oldp __unused, size_t *oldlenp __unused,
    user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused);

int
vfs_sysctl(int *name __unused, u_int namelen __unused,
    user_addr_t oldp __unused, size_t *oldlenp __unused,
    user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused)
{
	return EINVAL;
}
3911 
3912 
3913 //
3914 // The following code disallows specific sysctl's that came through
3915 // the direct sysctl interface (vfs_sysctl_node) instead of the newer
3916 // sysctl_vfs_ctlbyfsid() interface.  We can not allow these selectors
3917 // through vfs_sysctl_node() because it passes the user's oldp pointer
3918 // directly to the file system which (for these selectors) casts it
3919 // back to a struct sysctl_req and then proceed to use SYSCTL_IN()
3920 // which jumps through an arbitrary function pointer.  When called
3921 // through the sysctl_vfs_ctlbyfsid() interface this does not happen
3922 // and so it's safe.
3923 //
3924 // Unfortunately we have to pull in definitions from AFP and SMB and
3925 // perform explicit name checks on the file system to determine if
3926 // these selectors are being used.
3927 //
3928 
3929 #define AFPFS_VFS_CTL_GETID            0x00020001
3930 #define AFPFS_VFS_CTL_NETCHANGE        0x00020002
3931 #define AFPFS_VFS_CTL_VOLCHANGE        0x00020003
3932 
3933 #define SMBFS_SYSCTL_REMOUNT           1
3934 #define SMBFS_SYSCTL_REMOUNT_INFO      2
3935 #define SMBFS_SYSCTL_GET_SERVER_SHARE  3
3936 
3937 
3938 static int
is_bad_sysctl_name(struct vfstable * vfsp,int selector_name)3939 is_bad_sysctl_name(struct vfstable *vfsp, int selector_name)
3940 {
3941 	switch (selector_name) {
3942 	case VFS_CTL_QUERY:
3943 	case VFS_CTL_TIMEO:
3944 	case VFS_CTL_NOLOCKS:
3945 	case VFS_CTL_NSTATUS:
3946 	case VFS_CTL_SADDR:
3947 	case VFS_CTL_DISC:
3948 	case VFS_CTL_SERVERINFO:
3949 		return 1;
3950 
3951 	default:
3952 		break;
3953 	}
3954 
3955 	// the more complicated check for some of SMB's special values
3956 	if (strcmp(vfsp->vfc_name, "smbfs") == 0) {
3957 		switch (selector_name) {
3958 		case SMBFS_SYSCTL_REMOUNT:
3959 		case SMBFS_SYSCTL_REMOUNT_INFO:
3960 		case SMBFS_SYSCTL_GET_SERVER_SHARE:
3961 			return 1;
3962 		}
3963 	} else if (strcmp(vfsp->vfc_name, "afpfs") == 0) {
3964 		switch (selector_name) {
3965 		case AFPFS_VFS_CTL_GETID:
3966 		case AFPFS_VFS_CTL_NETCHANGE:
3967 		case AFPFS_VFS_CTL_VOLCHANGE:
3968 			return 1;
3969 		}
3970 	}
3971 
3972 	//
3973 	// If we get here we passed all the checks so the selector is ok
3974 	//
3975 	return 0;
3976 }
3977 
3978 
/*
 * Per-filesystem sysctl node handler: routes name[0..] to the target
 * filesystem's vfs_sysctl, after rejecting selectors that are unsafe
 * through this path (see block comment above).
 */
int vfs_sysctl_node SYSCTL_HANDLER_ARGS
{
	int *name, namelen;
	struct vfstable *vfsp;
	int error;
	int fstypenum;

	/* the OID number of this node is the filesystem type number */
	fstypenum = oidp->oid_number;
	name = arg1;
	namelen = arg2;

	/* all sysctl names at this level should have at least one name slot for the FS */
	if (namelen < 1) {
		return EISDIR; /* overloaded */
	}
	mount_list_lock();
	/* find the filesystem and pin it with a refcount while we call in */
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_typenum == fstypenum) {
			vfsp->vfc_refcount++;
			break;
		}
	}
	mount_list_unlock();

	if (vfsp == NULL) {
		return ENOTSUP;
	}

	if (is_bad_sysctl_name(vfsp, name[0])) {
		printf("vfs: bad selector 0x%.8x for old-style sysctl().  use the sysctl-by-fsid interface instead\n", name[0]);
		error = EPERM;
	} else {
		error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen,
		    req->oldptr, &req->oldlen, req->newptr, req->newlen,
		    vfs_context_current());
	}

	/* drop the pin taken above */
	mount_list_lock();
	vfsp->vfc_refcount--;
	mount_list_unlock();

	return error;
}
4022 
4023 /*
4024  * Check to see if a filesystem is mounted on a block device.
4025  */
/*
 * Returns EBUSY if the device vnode (or any alias of it) is mounted
 * on; waits out any foreign mount-in-progress before answering.
 */
int
vfs_mountedon(struct vnode *vp)
{
	struct vnode *vq;
	int error = 0;

restart:
	SPECHASH_LOCK();
	/* wait for a mount-in-progress owned by some other thread */
	if (vp->v_specflags & SI_MOUNTING && (vp->v_specinfo->si_mountingowner != current_thread())) {
		msleep((caddr_t)&vp->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
		goto restart;
	}
	if (vp->v_specflags & SI_MOUNTEDON) {
		error = EBUSY;
		goto out;
	}
	if (vp->v_specflags & SI_ALIASED) {
		/* every alias of this device must be checked as well */
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type) {
				continue;
			}
			if (vq->v_specflags & SI_MOUNTING) {
				/* PDROP releases the hash lock before sleeping */
				msleep((caddr_t)&vq->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
				goto restart;
			}
			if (vq->v_specflags & SI_MOUNTEDON) {
				error = EBUSY;
				break;
			}
		}
	}
out:
	SPECHASH_UNLOCK();
	return error;
}
4062 
/*
 * Commit a mount: mark the device vnode as mounted-on, clear the
 * in-progress state set by vfs_setmounting(), and wake any waiters.
 */
void
vfs_setmountedon(vnode_t vp)
{
	vnode_lock(vp);
	SPECHASH_LOCK();
	vp->v_specflags |= SI_MOUNTEDON;
	vp->v_specflags &= ~SI_MOUNTING;
	vp->v_specinfo->si_mountingowner = NULL;
	SPECHASH_UNLOCK();
	vnode_unlock(vp);
	wakeup(&vp->v_specflags);
}
4075 
/*
 * Abort a mount attempt: clear the in-progress (SI_MOUNTING) state set
 * by vfs_setmounting() and wake any waiters.
 */
void
vfs_clearmounting(vnode_t vp)
{
	vnode_lock(vp);
	SPECHASH_LOCK();
	vp->v_specflags &= ~SI_MOUNTING;
	vp->v_specinfo->si_mountingowner = NULL;
	SPECHASH_UNLOCK();
	vnode_unlock(vp);
	wakeup(&vp->v_specflags);
}
4087 
4088 /*
 * Mark a device vnode as having a mount in progress (SI_MOUNTING);
 * fails with EBUSY if it or an alias is already mounted on.
4090  */
int
vfs_setmounting(vnode_t vp)
{
	struct vnode *vq;
	int error = 0;

	vnode_lock(vp);
	/* wait for any other mount attempt on this vnode to finish */
	while (vp->v_specflags & SI_MOUNTING) {
		msleep((caddr_t)&vp->v_specflags, &vp->v_lock, PVFS, "vnode_waitformounting", NULL);
	}
	if (vp->v_specflags & SI_MOUNTEDON) {
		vnode_unlock(vp);
		return EBUSY;
	}
	/* claim the mount-in-progress state for this thread */
	SPECHASH_LOCK();
	vp->v_specflags |= SI_MOUNTING;
	vp->v_specinfo->si_mountingowner = current_thread();
	vnode_unlock(vp);
restart:
	if (vp->v_specflags & SI_ALIASED) {
		/* no alias of the device may be mounted (or mid-mount) either */
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type) {
				continue;
			}
			if (vq->v_specflags & SI_MOUNTING) {
				/* PDROP releases the hash lock before sleeping */
				msleep((caddr_t)&vq->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
				SPECHASH_LOCK();
				goto restart;
			}
			if (vq->v_specflags & SI_MOUNTEDON) {
				error = EBUSY;
				break;
			}
		}
	}
	SPECHASH_UNLOCK();
	if (error) {
		/* back out our SI_MOUNTING claim and wake waiters */
		vnode_lock(vp);
		SPECHASH_LOCK();
		vp->v_specflags &= ~SI_MOUNTING;
		SPECHASH_UNLOCK();
		vnode_unlock(vp);
		wakeup(&vp->v_specflags);
	}
	return error;
}
4138 
/* Accumulator threaded through unmount_callback() by vfs_unmountall(). */
struct unmount_info {
	int     u_errs; // Total failed unmounts
	int     u_busy; // EBUSY failed unmounts
	int     u_count; // Total volumes iterated
	int     u_only_non_system; // nonzero: skip root/system volumes
};
4145 
/*
 * vfs_iterate() callback: force-unmount one mount point, honoring the
 * u_only_non_system policy and tallying failures into *uip.
 * Always returns VFS_RETURNED; errors are recorded, not propagated.
 */
static int
unmount_callback(mount_t mp, void *arg)
{
	int error;
	char *mntname;
	struct unmount_info *uip = arg;

	uip->u_count++;

	/* copy the mount-point name first; mp may be gone after dounmount() */
	mntname = zalloc_flags(ZV_NAMEI, Z_WAITOK | Z_NOFAIL);
	strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);

	if (uip->u_only_non_system
	    && ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM))) { //MNTK_BACKS_ROOT
		printf("unmount(%d) %s skipped\n", uip->u_only_non_system, mntname);
		mount_iterdrop(mp);     // VFS_ITERATE_CB_DROPREF
	} else {
		printf("unmount(%d) %s\n", uip->u_only_non_system, mntname);

		/* trade the iterator reference for a real one; dounmount consumes it */
		mount_ref(mp, 0);
		mount_iterdrop(mp);     // VFS_ITERATE_CB_DROPREF
		error = dounmount(mp, MNT_FORCE, 1, vfs_context_current());
		if (error) {
			uip->u_errs++;
			printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error);
			if (error == EBUSY) {
				uip->u_busy++;
			}
		}
	}
	zfree(ZV_NAMEI, mntname);

	return VFS_RETURNED;
}
4180 
4181 /*
4182  * Unmount all filesystems. The list is traversed in reverse order
4183  * of mounting to avoid dependencies.
4184  * Busy mounts are retried.
4185  */
__private_extern__ void
vfs_unmountall(int only_non_system)
{
	int mounts, sec = 1;
	struct unmount_info ui;

	/*
	 * Ensure last-completion-time is valid before anyone can see that
	 * VFS shutdown has started.
	 */
	vfs_shutdown_last_completion_time = mach_absolute_time();
	OSMemoryBarrier();
	vfs_unmountall_started = 1;
	printf("vfs_unmountall(%ssystem) start\n", only_non_system ? "non" : "");

retry:
	ui.u_errs = ui.u_busy = ui.u_count = 0;
	ui.u_only_non_system = only_non_system;
	// avoid vfs_iterate deadlock in dounmount(), use VFS_ITERATE_CB_DROPREF
	vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
	mounts = mount_getvfscnt();
	if (mounts == 0) {
		goto out;
	}
	if (ui.u_busy > 0) {            // Busy mounts - wait & retry
		/* exponential backoff: 1s, 2s, 4s ... stop once the next wait would exceed 32s */
		tsleep(&nummounts, PVFS, "busy mount", sec * hz);
		sec *= 2;
		if (sec <= 32) {
			goto retry;
		}
		printf("Unmounting timed out\n");
	} else if (ui.u_count < mounts) {
		// If the vfs_iterate missed mounts in progress - wait a bit
		tsleep(&nummounts, PVFS, "missed mount", 2 * hz);
	}

out:
	printf("vfs_unmountall(%ssystem) end\n", only_non_system ? "non" : "");

	/*
	 * reboot_kernel() calls us twice; once to deal with non-system
	 * mounts, and again to sweep up anything left after terminating
	 * DEXTs.  We're only finished once we've completed the second pass.
	 */
	if (!only_non_system) {
		vfs_unmountall_finished = 1;
	}
}
4234 
4235 /*
4236  * vfs_shutdown_in_progress --
4237  *
4238  * Returns whether or not the VFS is shutting down the file systems.
4239  */
4240 boolean_t
vfs_shutdown_in_progress(void)4241 vfs_shutdown_in_progress(void)
4242 {
4243 	return vfs_unmountall_started && !vfs_unmountall_finished;
4244 }
4245 
4246 /*
4247  * vfs_shutdown_finished --
4248  *
4249  * Returns whether or not the VFS shutdown has completed.
4250  */
4251 boolean_t
vfs_shutdown_finished(void)4252 vfs_shutdown_finished(void)
4253 {
4254 	return !!vfs_unmountall_finished;
4255 }
4256 
4257 /*
4258  * vfs_update_last_completion_time --
4259  *
4260  * Updates the "last I/O completion time" timestamp used by the watchdog
4261  * to monitor VFS shutdown progress.  Called by various I/O stack layers
4262  * as operations complete and progress moves forward.
4263  */
4264 void
vfs_update_last_completion_time(void)4265 vfs_update_last_completion_time(void)
4266 {
4267 	if (vfs_unmountall_started) {
4268 		vfs_shutdown_last_completion_time = mach_absolute_time();
4269 	}
4270 }
4271 
4272 /*
4273  * vfs_last_completion_time --
4274  *
4275  * Returns the "last I/O completion time" timestamp.  Return
4276  * value is a mach_absolute_time() value, and is not meaningful
4277  * unless vfs_is_shutting_down() also returns true.
4278  */
4279 uint64_t
vfs_last_completion_time(void)4280 vfs_last_completion_time(void)
4281 {
4282 	return vfs_unmountall_started ? vfs_shutdown_last_completion_time : 0;
4283 }
4284 
4285 /*
4286  * This routine is called from vnode_pager_deallocate out of the VM
4287  * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
4288  * on a vnode that has a UBCINFO
4289  */
__private_extern__ void
vnode_pager_vrele(vnode_t vp)
{
	struct ubc_info *uip;

	vnode_lock_spin(vp);

	vp->v_lflag &= ~VNAMED_UBC;
	if (vp->v_usecount != 0) {
		/*
		 * At the eleventh hour, just before the ubcinfo is
		 * destroyed, ensure the ubc-specific v_usecount
		 * reference has gone.  We use v_usecount != 0 as a hint;
		 * ubc_unmap() does nothing if there's no mapping.
		 *
		 * This case is caused by coming here via forced unmount,
		 * versus the usual vm_object_deallocate() path.
		 * In the forced unmount case, ubc_destroy_named()
		 * releases the pager before memory_object_last_unmap()
		 * can be called.
		 */
		/* must drop the vnode lock across ubc_unmap(), then retake it */
		vnode_unlock(vp);
		ubc_unmap(vp);
		vnode_lock_spin(vp);
	}

	/* detach the ubc_info under the vnode lock; release it unlocked */
	uip = vp->v_ubcinfo;
	vp->v_ubcinfo = UBC_INFO_NULL;

	vnode_unlock(vp);

	ubc_info_deallocate(uip);
}
4323 
4324 
4325 #include <sys/disk.h>
4326 
/* BSD disk unit of the root device; (u_int32_t)-1 until vfs_init_io_attributes() sees rootvp */
u_int32_t rootunit = (u_int32_t)-1;
4328 
4329 #if CONFIG_IOSCHED
4330 extern int lowpri_throttle_enabled;
4331 extern int iosched_enabled;
4332 #endif
4333 
/*
 * Interrogate the block device backing 'mp' (DKIOC* ioctls on devvp) and
 * initialize the mount's I/O attributes: max transfer byte/block counts,
 * segment counts and sizes, DMA alignment, I/O queue depth, and feature
 * flags (FUA, UNMAP, SSD/virtual/fusion/external detection, I/O scheduling).
 * Returns 0, or the errno from a mandatory ioctl.
 */
errno_t
vfs_init_io_attributes(vnode_t devvp, mount_t mp)
{
	int     error;
	off_t   readblockcnt = 0;
	off_t   writeblockcnt = 0;
	off_t   readmaxcnt = 0;
	off_t   writemaxcnt = 0;
	off_t   readsegcnt = 0;
	off_t   writesegcnt = 0;
	off_t   readsegsize = 0;
	off_t   writesegsize = 0;
	off_t   alignment = 0;
	u_int32_t minsaturationbytecount = 0;
	u_int32_t ioqueue_depth = 0;
	u_int32_t blksize;
	u_int64_t temp;
	u_int32_t features;
	u_int64_t location = 0;
	vfs_context_t ctx = vfs_context_current();
	dk_corestorage_info_t cs_info;
	boolean_t cs_present = FALSE;
	int isssd = 0;
	int isvirtual = 0;


	/* best-effort: failure leaves mnt_throttle_mask as-is */
	VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
	/*
	 * as a reasonable approximation, only use the lowest bit of the mask
	 * to generate a disk unit number
	 */
	mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);

	if (devvp == rootvp) {
		rootunit = mp->mnt_devbsdunit;
	}

	if (mp->mnt_devbsdunit == rootunit) {
		/*
		 * this mount point exists on the same device as the root
		 * partition, so it comes under the hard throttle control...
		 * this is true even for the root mount point itself
		 */
		mp->mnt_kern_flag |= MNTK_ROOTDEV;
	}
	/*
	 * force the spec device to re-cache
	 * the underlying block size in case
	 * the filesystem overrode the initial value
	 */
	set_fsblocksize(devvp);


	if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
	    (caddr_t)&blksize, 0, ctx))) {
		return error;
	}

	mp->mnt_devblocksize = blksize;

	/*
	 * set the maximum possible I/O size
	 * this may get clipped to a smaller value
	 * based on which constraints are being advertised
	 * and if those advertised constraints result in a smaller
	 * limit for a given I/O
	 */
	mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES;
	mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES;

	/* optional queries: silently skipped when the driver doesn't support them */
	if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
		if (isvirtual) {
			mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
			mp->mnt_flag |= MNT_REMOVABLE;
		}
	}
	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
		if (isssd) {
			mp->mnt_kern_flag |= MNTK_SSD;
		}
	}
	/* the remaining DKIOC queries are mandatory: fail the mount setup on error */
	if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
	    (caddr_t)&features, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
	    (caddr_t)&readblockcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
	    (caddr_t)&writeblockcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
	    (caddr_t)&readmaxcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
	    (caddr_t)&writemaxcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
	    (caddr_t)&readsegcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
	    (caddr_t)&writesegcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
	    (caddr_t)&readsegsize, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
	    (caddr_t)&writesegsize, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
	    (caddr_t)&alignment, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
	    (caddr_t)&ioqueue_depth, 0, ctx))) {
		return error;
	}

	/*
	 * Clip each advertised limit to what the mount fields can express:
	 * 32-bit byte counts, 16-bit segment counts.  A zero answer means
	 * "no constraint advertised" and leaves the current value in place.
	 */
	if (readmaxcnt) {
		mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX :(uint32_t) readmaxcnt;
	}

	if (readblockcnt) {
		temp = readblockcnt * blksize;
		temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;

		if (temp < mp->mnt_maxreadcnt) {
			mp->mnt_maxreadcnt = (u_int32_t)temp;
		}
	}

	if (writemaxcnt) {
		mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : (uint32_t)writemaxcnt;
	}

	if (writeblockcnt) {
		temp = writeblockcnt * blksize;
		temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;

		if (temp < mp->mnt_maxwritecnt) {
			mp->mnt_maxwritecnt = (u_int32_t)temp;
		}
	}

	if (readsegcnt) {
		temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
	} else {
		/* no advertised segment count: derive one from the byte limit */
		temp = mp->mnt_maxreadcnt / PAGE_SIZE;

		if (temp > UINT16_MAX) {
			temp = UINT16_MAX;
		}
	}
	mp->mnt_segreadcnt = (u_int16_t)temp;

	if (writesegcnt) {
		temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
	} else {
		temp = mp->mnt_maxwritecnt / PAGE_SIZE;

		if (temp > UINT16_MAX) {
			temp = UINT16_MAX;
		}
	}
	mp->mnt_segwritecnt = (u_int16_t)temp;

	if (readsegsize) {
		temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
	} else {
		temp = mp->mnt_maxreadcnt;
	}
	mp->mnt_maxsegreadsize = (u_int32_t)temp;

	if (writesegsize) {
		temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
	} else {
		temp = mp->mnt_maxwritecnt;
	}
	mp->mnt_maxsegwritesize = (u_int32_t)temp;

	/* store alignment as a mask; anything beyond page alignment is capped */
	if (alignment) {
		temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1;
	} else {
		temp = 0;
	}
	mp->mnt_alignmentmask = (uint32_t)temp;


	if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) {
		temp = ioqueue_depth;
	} else {
		temp = MNT_DEFAULT_IOQUEUE_DEPTH;
	}

	mp->mnt_ioqueue_depth = (uint32_t)temp;
	mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);

	if (mp->mnt_ioscale > 1) {
		printf("ioqueue_depth = %d,   ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
	}

	if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
		mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED;
	}

	if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
		mp->mnt_minsaturationbytecount = minsaturationbytecount;
	} else {
		mp->mnt_minsaturationbytecount = 0;
	}

	if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) {
		cs_present = TRUE;
	}

	if (features & DK_FEATURE_UNMAP) {
		mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED;

		if (cs_present == TRUE) {
			mp->mnt_ioflags |= MNT_IOFLAGS_CSUNMAP_SUPPORTED;
		}
	}
	if (cs_present == TRUE) {
		/*
		 * for now we'll use the following test as a proxy for
		 * the underlying drive being FUSION in nature
		 */
		if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) {
			mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
		}
	} else {
		/* Check for APFS Fusion */
		dk_apfs_flavour_t flavour;
		if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
		    (flavour == DK_APFS_FUSION)) {
			mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
		}
	}

	if (VNOP_IOCTL(devvp, DKIOCGETLOCATION, (caddr_t)&location, 0, ctx) == 0) {
		if (location & DK_LOCATION_EXTERNAL) {
			mp->mnt_ioflags |= MNT_IOFLAGS_PERIPHERAL_DRIVE;
			mp->mnt_flag |= MNT_REMOVABLE;
		}
	}

#if CONFIG_IOSCHED
	if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) {
		mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED;
		throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != 0);
	}
#endif /* CONFIG_IOSCHED */
	/* error is 0 here: every failing mandatory ioctl returned early above */
	return error;
}
4606 
4607 static struct klist fs_klist;
4608 static LCK_GRP_DECLARE(fs_klist_lck_grp, "fs_klist");
4609 static LCK_MTX_DECLARE(fs_klist_lock, &fs_klist_lck_grp);
4610 
/* One-time initialization of the global fs_klist used for EVFILT_FS knotes. */
void
vfs_event_init(void)
{
	klist_init(&fs_klist);
}
4616 
/*
 * Post a VFS event to every EVFILT_FS listener on fs_klist.
 * For VQ_DEAD/VQ_NOTRESP, also track the mount's responsiveness:
 * nonzero 'data' means it is responding again.
 */
void
vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
{
	if (event == VQ_DEAD || event == VQ_NOTRESP) {
		struct mount *mp = vfs_getvfs(fsid);
		if (mp) {
			mount_lock_spin(mp);
			if (data) {
				mp->mnt_kern_flag &= ~MNT_LNOTRESP;     // Now responding
			} else {
				mp->mnt_kern_flag |= MNT_LNOTRESP;      // Not responding
			}
			mount_unlock(mp);
		}
	}

	lck_mtx_lock(&fs_klist_lock);
	KNOTE(&fs_klist, event);
	lck_mtx_unlock(&fs_klist_lock);
}
4637 
4638 /*
4639  * return the number of mounted filesystems.
4640  */
/* sysctl-side wrapper around mount_getvfscnt(). */
static int
sysctl_vfs_getvfscnt(void)
{
	return mount_getvfscnt();
}
4646 
4647 
4648 static int
mount_getvfscnt(void)4649 mount_getvfscnt(void)
4650 {
4651 	int ret;
4652 
4653 	mount_list_lock();
4654 	ret = nummounts;
4655 	mount_list_unlock();
4656 	return ret;
4657 }
4658 
4659 
4660 
4661 static int
mount_fillfsids(fsid_t * fsidlst,int count)4662 mount_fillfsids(fsid_t *fsidlst, int count)
4663 {
4664 	struct mount *mp;
4665 	int actual = 0;
4666 
4667 	actual = 0;
4668 	mount_list_lock();
4669 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4670 		if (actual < count) {
4671 			fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
4672 			actual++;
4673 		}
4674 	}
4675 	mount_list_unlock();
4676 	return actual;
4677 }
4678 
4679 /*
4680  * fill in the array of fsid_t's up to a max of 'count', the actual
4681  * number filled in will be set in '*actual'.  If there are more fsid_t's
4682  * than room in fsidlst then ENOMEM will be returned and '*actual' will
4683  * have the actual count.
4684  * having *actual filled out even in the error case is depended upon.
4685  */
static int
sysctl_vfs_getvfslist(fsid_t *fsidlst, unsigned long count, unsigned long *actual)
{
	struct mount *mp;

	*actual = 0;
	mount_list_lock();
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		(*actual)++;
		/* keep counting past 'count' so the caller learns the true total */
		if (*actual <= count) {
			fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
		}
	}
	mount_list_unlock();
	return *actual <= count ? 0 : ENOMEM;
}
4702 
/*
 * sysctl handler (read-only): copy out one fsid_t per mounted filesystem.
 * If a mount races in between sizing the buffer and filling it, the fill
 * reports ENOMEM and we retry with the caller's original buffer size.
 */
static int
sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	unsigned long actual;
	int error;
	size_t space;
	fsid_t *fsidlst;

	/* This is a readonly node. */
	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	/* they are querying us so just return the space required. */
	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
		return 0;
	}
again:
	/*
	 * Retrieve an accurate count of the amount of space required to copy
	 * out all the fsids in the system.
	 */
	space = req->oldlen;
	req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);

	/* they didn't give us enough space. */
	if (space < req->oldlen) {
		return ENOMEM;
	}

	fsidlst = kalloc_data(req->oldlen, Z_WAITOK | Z_ZERO);
	if (fsidlst == NULL) {
		return ENOMEM;
	}

	error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
	    &actual);
	/*
	 * If we get back ENOMEM, then another mount has been added while we
	 * slept in malloc above.  If this is the case then try again.
	 */
	if (error == ENOMEM) {
		kfree_data(fsidlst, req->oldlen);
		req->oldlen = space;
		goto again;
	}
	if (error == 0) {
		error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
	}
	kfree_data(fsidlst, req->oldlen);
	return error;
}
4757 
4758 /*
4759  * Do a sysctl by fsid.
4760  */
static int
sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
    struct sysctl_req *req)
{
	union union_vfsidctl vc;
	struct mount *mp = NULL;
	struct vfsstatfs *sp;
	int *name, namelen;
	int flags = 0;
	int error = 0, gotref = 0;      /* gotref: we hold an iteration ref on mp */
	vfs_context_t ctx = vfs_context_current();
	proc_t p = req->p;      /* XXX req->p != current_proc()? */
	boolean_t is_64_bit;
	union {
		struct statfs64 sfs64;
		struct user64_statfs osfs64;
		struct user32_statfs osfs32;
	} *sfsbuf;

	if (req->newptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto out;
	}

	name = arg1;
	namelen = arg2;
	is_64_bit = proc_is64bit(p);

	/* copy in the vfsidctl header; layout is ABI-dependent */
	error = SYSCTL_IN(req, &vc, is_64_bit? sizeof(vc.vc64):sizeof(vc.vc32));
	if (error) {
		goto out;
	}
	if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */
		error = EINVAL;
		goto out;
	}
	mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */
	if (mp == NULL) {
		error = ENOENT;
		goto out;
	}
	gotref = 1;
	/* reset so that the fs specific code can fetch it. */
	req->newidx = 0;
	/*
	 * Note if this is a VFS_CTL then we pass the actual sysctl req
	 * in for "oldp" so that the lower layer can DTRT and use the
	 * SYSCTL_IN/OUT routines.
	 */
	if (mp->mnt_op->vfs_sysctl != NULL) {
		if (is_64_bit) {
			if (vfs_64bitready(mp)) {
				error = mp->mnt_op->vfs_sysctl(name, namelen,
				    CAST_USER_ADDR_T(req),
				    NULL, USER_ADDR_NULL, 0,
				    ctx);
			} else {
				error = ENOTSUP;
			}
		} else {
			error = mp->mnt_op->vfs_sysctl(name, namelen,
			    CAST_USER_ADDR_T(req),
			    NULL, USER_ADDR_NULL, 0,
			    ctx);
		}
		if (error != ENOTSUP) {
			goto out;
		}
	}
	/* generic fallback when the filesystem didn't handle the request itself */
	switch (name[0]) {
	case VFS_CTL_UMOUNT:
#if CONFIG_MACF
		error = mac_mount_check_umount(ctx, mp);
		if (error != 0) {
			goto out;
		}
#endif
		req->newidx = 0;
		if (is_64_bit) {
			req->newptr = vc.vc64.vc_ptr;
			req->newlen = (size_t)vc.vc64.vc_len;
		} else {
			req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
			req->newlen = vc.vc32.vc_len;
		}
		error = SYSCTL_IN(req, &flags, sizeof(flags));
		if (error) {
			break;
		}

		/* trade the iteration ref for a real mount ref before unmounting */
		mount_ref(mp, 0);
		mount_iterdrop(mp);
		gotref = 0;
		/* safedounmount consumes a ref */
		error = safedounmount(mp, flags, ctx);
		break;
	case VFS_CTL_OSTATFS:
	case VFS_CTL_STATFS64:
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			break;
		}
#endif
		req->newidx = 0;
		if (is_64_bit) {
			req->newptr = vc.vc64.vc_ptr;
			req->newlen = (size_t)vc.vc64.vc_len;
		} else {
			req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
			req->newlen = vc.vc32.vc_len;
		}
		error = SYSCTL_IN(req, &flags, sizeof(flags));
		if (error) {
			break;
		}
		sp = &mp->mnt_vfsstat;
		/* refresh the statistics unless the caller asked for cached-only */
		if (((flags & MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) {
			goto out;
		}

		sfsbuf = kalloc_type(typeof(*sfsbuf), Z_WAITOK);

		if (name[0] == VFS_CTL_STATFS64) {
			struct statfs64 *sfs = &sfsbuf->sfs64;

			vfs_get_statfs64(mp, sfs);
			error = SYSCTL_OUT(req, sfs, sizeof(*sfs));
		} else if (is_64_bit) {
			struct user64_statfs *sfs = &sfsbuf->osfs64;

			bzero(sfs, sizeof(*sfs));
			sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
			sfs->f_type = (short)mp->mnt_vtable->vfc_typenum;
			sfs->f_bsize = (user64_long_t)sp->f_bsize;
			sfs->f_iosize = (user64_long_t)sp->f_iosize;
			sfs->f_blocks = (user64_long_t)sp->f_blocks;
			sfs->f_bfree = (user64_long_t)sp->f_bfree;
			sfs->f_bavail = (user64_long_t)sp->f_bavail;
			sfs->f_files = (user64_long_t)sp->f_files;
			sfs->f_ffree = (user64_long_t)sp->f_ffree;
			sfs->f_fsid = sp->f_fsid;
			sfs->f_owner = sp->f_owner;
			vfs_getfstypename(mp, sfs->f_fstypename, MFSNAMELEN);
			strlcpy(sfs->f_mntonname, sp->f_mntonname, MNAMELEN);
			strlcpy(sfs->f_mntfromname, sp->f_mntfromname, MNAMELEN);

			error = SYSCTL_OUT(req, sfs, sizeof(*sfs));
		} else {
			struct user32_statfs *sfs = &sfsbuf->osfs32;
			long temp;

			bzero(sfs, sizeof(*sfs));
			sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
			sfs->f_type = (short)mp->mnt_vtable->vfc_typenum;

			/*
			 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
			 * have to fudge the numbers here in that case.   We inflate the blocksize in order
			 * to reflect the filesystem size as best we can.
			 */
			if (sp->f_blocks > INT_MAX) {
				int             shift;

				/*
				 * Work out how far we have to shift the block count down to make it fit.
				 * Note that it's possible to have to shift so far that the resulting
				 * blocksize would be unreportably large.  At that point, we will clip
				 * any values that don't fit.
				 *
				 * For safety's sake, we also ensure that f_iosize is never reported as
				 * being smaller than f_bsize.
				 */
				for (shift = 0; shift < 32; shift++) {
					if ((sp->f_blocks >> shift) <= INT_MAX) {
						break;
					}
					if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) {
						break;
					}
				}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
				sfs->f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift);
				sfs->f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift);
				sfs->f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
				sfs->f_bsize = (user32_long_t)(sp->f_bsize << shift);
				temp = lmax(sp->f_iosize, sp->f_bsize);
				if (temp > INT32_MAX) {
					error = EINVAL;
					kfree_type(typeof(*sfsbuf), sfsbuf);
					goto out;
				}
				sfs->f_iosize = (user32_long_t)temp;
			} else {
				sfs->f_bsize = (user32_long_t)sp->f_bsize;
				sfs->f_iosize = (user32_long_t)sp->f_iosize;
				sfs->f_blocks = (user32_long_t)sp->f_blocks;
				sfs->f_bfree = (user32_long_t)sp->f_bfree;
				sfs->f_bavail = (user32_long_t)sp->f_bavail;
			}
			sfs->f_files = (user32_long_t)sp->f_files;
			sfs->f_ffree = (user32_long_t)sp->f_ffree;
			sfs->f_fsid = sp->f_fsid;
			sfs->f_owner = sp->f_owner;

			vfs_getfstypename(mp, sfs->f_fstypename, MFSNAMELEN);
			strlcpy(sfs->f_mntonname, sp->f_mntonname, MNAMELEN);
			strlcpy(sfs->f_mntfromname, sp->f_mntfromname, MNAMELEN);

			error = SYSCTL_OUT(req, sfs, sizeof(*sfs));
		}
		kfree_type(typeof(*sfsbuf), sfsbuf);
		break;
	default:
		error = ENOTSUP;
		goto out;
	}
out:
	/* drop the iteration ref from mount_list_lookupby_fsid() if still held */
	if (gotref != 0) {
		mount_iterdrop(mp);
	}
	return error;
}
4986 
static int      filt_fsattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_fsdetach(struct knote *kn);
static int      filt_fsevent(struct knote *kn, long hint);
static int      filt_fstouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev);
/* EVFILT_FS filter operations; knotes live on the global fs_klist */
SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
	.f_attach = filt_fsattach,
	.f_detach = filt_fsdetach,
	.f_event = filt_fsevent,
	.f_touch = filt_fstouch,
	.f_process = filt_fsprocess,
};
4999 
/* Attach an EVFILT_FS knote to the global fs_klist. */
static int
filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	kn->kn_flags |= EV_CLEAR; /* automatic */
	kn->kn_sdata = 0;         /* incoming data is ignored */

	lck_mtx_lock(&fs_klist_lock);
	KNOTE_ATTACH(&fs_klist, kn);
	lck_mtx_unlock(&fs_klist_lock);

	/*
	 * filter only sees future events,
	 * so it can't be fired already.
	 */
	return 0;
}
5016 
/* Detach an EVFILT_FS knote from the global fs_klist. */
static void
filt_fsdetach(struct knote *kn)
{
	lck_mtx_lock(&fs_klist_lock);
	KNOTE_DETACH(&fs_klist, kn);
	lck_mtx_unlock(&fs_klist_lock);
}
5023 }
5024 
5025 static int
filt_fsevent(struct knote * kn,long hint)5026 filt_fsevent(struct knote *kn, long hint)
5027 {
5028 	/*
5029 	 * Backwards compatibility:
5030 	 * Other filters would do nothing if kn->kn_sfflags == 0
5031 	 */
5032 
5033 	if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) {
5034 		kn->kn_fflags |= hint;
5035 	}
5036 
5037 	return kn->kn_fflags != 0;
5038 }
5039 
/*
 * Update the knote's event mask from userspace and report whether events
 * are already pending.
 */
static int
filt_fstouch(struct knote *kn, struct kevent_qos_s *kev)
{
	int res;

	lck_mtx_lock(&fs_klist_lock);

	kn->kn_sfflags = kev->fflags;

	/*
	 * the above filter function sets bits even if nobody is looking for them.
	 * Just preserve those bits even if the new mask is more selective
	 * than before.
	 *
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//if (kn->kn_sfflags)
	//	kn->kn_fflags &= kn->kn_sfflags;
	res = (kn->kn_fflags != 0);

	lck_mtx_unlock(&fs_klist_lock);

	return res;
}
5064 }
5065 
/* Report pending fs events into *kev; returns 1 if an event was delivered. */
static int
filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	int res = 0;

	lck_mtx_lock(&fs_klist_lock);
	if (kn->kn_fflags) {
		knote_fill_kevent(kn, kev, 0);
		res = 1;
	}
	lck_mtx_unlock(&fs_klist_lock);
	return res;
}
5079 
/*
 * sysctl handler: get/set the P_NOREMOTEHANG flag on a process.
 * New value is a pid: a negative pid clears the flag, a positive pid sets
 * it.  With an old pointer supplied, reports whether the flag is set.
 */
static int
sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	int out, error;
	pid_t pid;
	proc_t p;

	/* We need a pid. */
	if (req->newptr == USER_ADDR_NULL) {
		return EINVAL;
	}

	error = SYSCTL_IN(req, &pid, sizeof(pid));
	if (error) {
		return error;
	}

	/* the sign of the pid encodes set/clear; look up its absolute value */
	p = proc_find(pid < 0 ? -pid : pid);
	if (p == NULL) {
		return ESRCH;
	}

	/*
	 * Fetching the value is ok, but we only fetch if the old
	 * pointer is given.
	 */
	if (req->oldptr != USER_ADDR_NULL) {
		out = !((p->p_flag & P_NOREMOTEHANG) == 0);
		proc_rele(p);
		error = SYSCTL_OUT(req, &out, sizeof(out));
		return error;
	}

	/* cansignal offers us enough security. */
	if (p != req->p && proc_suser(req->p) != 0) {
		proc_rele(p);
		return EPERM;
	}

	if (pid < 0) {
		OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag);
	} else {
		OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
	proc_rele(p);

	return 0;
}
5129 
/*
 * sysctl handler: return the struct vfsconf for the filesystem type
 * number given as the single remaining OID name component.
 */
static int
sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS
{
	int *name, namelen;
	struct vfstable *vfsp;
	struct vfsconf vfsc = {};

	(void)oidp;
	name = arg1;
	namelen = arg2;

	if (namelen < 1) {
		return EISDIR;
	} else if (namelen > 1) {
		return ENOTDIR;
	}

	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_typenum == name[0]) {
			break;
		}
	}

	if (vfsp == NULL) {
		mount_list_unlock();
		return ENOTSUP;
	}

	/* copy out only the user-visible fields; zero the reserved ones */
	vfsc.vfc_reserved1 = 0;
	bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
	vfsc.vfc_typenum = vfsp->vfc_typenum;
	vfsc.vfc_refcount = vfsp->vfc_refcount;
	vfsc.vfc_flags = vfsp->vfc_flags;
	vfsc.vfc_reserved2 = 0;
	vfsc.vfc_reserved3 = 0;

	mount_list_unlock();
	return SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf));
}
5170 
/* the vfs.generic. branch. */
SYSCTL_EXTENSIBLE_NODE(_vfs, VFS_GENERIC, generic,
    CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
/* retrieve a list of mounted filesystem fsid_t */
SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
/* perform operations on filesystem via fsid_t */
SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
    sysctl_vfs_ctlbyfsid, "ctlbyfsid");
/* get/set the P_NOREMOTEHANG flag of a process (see sysctl_vfs_noremotehang) */
SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
    NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
/* highest currently assigned filesystem type number */
SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
    &maxvfstypenum, 0, "");
SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
/* per-type filesystem configuration (see sysctl_vfs_generic_conf) */
SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    sysctl_vfs_generic_conf, "");
#if DEVELOPMENT || DEBUG
SYSCTL_INT(_vfs_generic, OID_AUTO, print_busy_vnodes,
    CTLTYPE_INT | CTLFLAG_RW,
    &print_busy_vnodes, 0,
    "VFS log busy vnodes blocking unmount");
#endif
5196 
/* Indicate that the root file system unmounted cleanly */
static int vfs_root_unmounted_cleanly = 0;
SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");

/*
 * Record that the root filesystem was unmounted cleanly; the value is
 * exported read-only through vfs.generic.root_unmounted_cleanly.
 */
void
vfs_set_root_unmounted_cleanly(void)
{
	vfs_root_unmounted_cleanly = 1;
}
5206 
5207 /*
5208  * Print vnode state.
5209  */
5210 void
vn_print_state(struct vnode * vp,const char * fmt,...)5211 vn_print_state(struct vnode *vp, const char *fmt, ...)
5212 {
5213 	va_list ap;
5214 	char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
5215 	char fs_name[MFSNAMELEN];
5216 
5217 	va_start(ap, fmt);
5218 	vprintf(fmt, ap);
5219 	va_end(ap);
5220 	printf("vp 0x%0llx %s: ", (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str);
5221 	printf("tag %d, type %d\n", vp->v_tag, vp->v_type);
5222 	/* Counts .. */
5223 	printf("    iocount %d, usecount %d, kusecount %d references %d\n",
5224 	    vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references);
5225 	printf("    writecount %d, numoutput %d\n", vp->v_writecount,
5226 	    vp->v_numoutput);
5227 	/* Flags */
5228 	printf("    flag 0x%x, lflag 0x%x, listflag 0x%x\n", vp->v_flag,
5229 	    vp->v_lflag, vp->v_listflag);
5230 
5231 	if (vp->v_mount == NULL || vp->v_mount == dead_mountp) {
5232 		strlcpy(fs_name, "deadfs", MFSNAMELEN);
5233 	} else {
5234 		vfs_name(vp->v_mount, fs_name);
5235 	}
5236 
5237 	printf("    v_data 0x%0llx %s\n",
5238 	    (vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : 0),
5239 	    perm_str);
5240 	printf("    v_mount 0x%0llx %s vfs_name %s\n",
5241 	    (vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : 0),
5242 	    perm_str, fs_name);
5243 }
5244 
/* count of vnodes recycled from the reuse lists rather than freshly allocated */
long num_reusedvnodes = 0;
5246 
5247 
/*
 * Attempt to take a vnode off a reuse list (free/rage/async/dead) and
 * reclaim it for a new use.
 *
 * Entered with the vnode list lock held; it is dropped before return.
 * On success with want_vp set, returns the vnode locked and with a
 * holdcount, fully reclaimed.  Returns NULLVP when:
 *   - the vnode was lost to a race (someone re-referenced it),
 *   - a deferred VNOP_INACTIVE had to be driven via vnode_put, or
 *   - the vnode was handed to the async worker because it lives on
 *     unreliable media or is dirty (*deferred set to 1 in that case).
 */
static vnode_t
process_vp(vnode_t vp, int want_vp, bool can_defer, int *deferred)
{
	unsigned int  vpid;

	*deferred = 0;

	vpid = vp->v_id;

	vnode_list_remove_locked(vp);

	vnode_hold(vp);
	vnode_list_unlock();

	vnode_lock_spin(vp);

	/*
	 * We could wait for the vnode_lock after removing the vp from the freelist
	 * and the vid is bumped only at the very end of reclaim. So it is  possible
	 * that we are looking at a vnode that is being terminated. If so skip it.
	 */
	if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
	    VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
		/*
		 * we lost the race between dropping the list lock
		 * and picking up the vnode_lock... someone else
		 * used this vnode and it is now in a new state
		 */
		vnode_drop_and_unlock(vp);

		return NULLVP;
	}
	if ((vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE) {
		/*
		 * we did a vnode_rele_ext that asked for
		 * us not to reenter the filesystem during
		 * the release even though VL_NEEDINACTIVE was
		 * set... we'll do it here by doing a
		 * vnode_get/vnode_put
		 *
		 * pick up an iocount so that we can call
		 * vnode_put and drive the VNOP_INACTIVE...
		 * vnode_put will either leave us off
		 * the freelist if a new ref comes in,
		 * or put us back on the end of the freelist
		 * or recycle us if we were marked for termination...
		 * so we'll just go grab a new candidate
		 */
		vp->v_iocount++;
#ifdef CONFIG_IOCOUNT_TRACE
		record_vp(vp, 1);
#endif
		vnode_put_locked(vp);
		vnode_drop_and_unlock(vp);

		return NULLVP;
	}
	/*
	 * Checks for anyone racing us for recycle
	 */
	if (vp->v_type != VBAD) {
		/*
		 * Defer unreliable-media / dirty vnodes to the async worker so
		 * this caller isn't stalled on slow I/O during reclaim.
		 */
		if ((want_vp || can_defer) && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) {
			vnode_async_list_add(vp);
			vnode_drop_and_unlock(vp);

			*deferred = 1;

			return NULLVP;
		}
		if (vp->v_lflag & VL_DEAD) {
			panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);
		}

		vnode_lock_convert(vp);
		(void)vnode_reclaim_internal(vp, 1, want_vp, 0);

		if (want_vp) {
			/* Sanity-check that reclaim fully disconnected the vnode. */
			if ((VONLIST(vp))) {
				panic("new_vnode(%p): vp on list", vp);
			}
			if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
			    (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) {
				panic("new_vnode(%p): free vnode still referenced", vp);
			}
			if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) {
				panic("new_vnode(%p): vnode seems to be on mount list", vp);
			}
			if (!LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren)) {
				panic("new_vnode(%p): vnode still hooked into the name cache", vp);
			}
		} else {
			vnode_drop_and_unlock(vp);
			vp = NULLVP;
		}
	}
	return vp;
}
5345 
/*
 * Body of the async work thread: loops forever pulling vnodes off
 * vnode_async_work_list (vnodes deferred by process_vp because they sit
 * on unreliable media or are dirty) and reclaiming them via process_vp().
 * Blocks on the queue when it is empty; never returns.
 */
__attribute__((noreturn))
static void
async_work_continue(void)
{
	struct async_work_lst *q;
	int     deferred;
	vnode_t vp;

	q = &vnode_async_work_list;

	for (;;) {
		vnode_list_lock();

		if (TAILQ_EMPTY(q)) {
			assert_wait(q, (THREAD_UNINT));

			vnode_list_unlock();

			thread_block((thread_continue_t)async_work_continue);

			continue;
		}
		async_work_handled++;

		vp = TAILQ_FIRST(q);

		/* want_vp == 0: reclaim in place; a non-NULL return indicates a bug */
		vp = process_vp(vp, 0, false, &deferred);

		if (vp != NULLVP) {
			panic("found VBAD vp (%p) on async queue", vp);
		}
	}
}
5379 
#if CONFIG_JETSAM
/* Set under the vnode list lock to request a vnode-limit jetsam pass. */
bool do_async_jetsam = false;
#endif

/*
 * Body of the laundry thread: keeps the dead-vnode pool replenished by
 * reclaiming vnodes, draining the async work queue first, then the
 * rapid-age and free lists.  With CONFIG_JETSAM it may also initiate
 * jetsam kills when the vnode table is exhausted.  Sleeps on
 * vnode_free_list when there is nothing to do; never returns.
 */
__attribute__((noreturn))
static void
vn_laundry_continue(void)
{
	struct freelst *free_q;
	struct ragelst *rage_q;
	vnode_t vp;
	int deferred;
	bool rage_q_empty;
	bool free_q_empty;


	free_q = &vnode_free_list;
	rage_q = &vnode_rage_list;

	for (;;) {
		vnode_list_lock();

#if CONFIG_JETSAM
		if (do_async_jetsam) {
			do_async_jetsam = false;
			/* Only kill if the dead pool is still depleted. */
			if (deadvnodes <= deadvnodes_low) {
				vnode_list_unlock();

				log(LOG_EMERG, "Initiating vnode jetsam : %d desired, %ld numvnodes, "
				    "%ld free, %ld dead, %ld async, %d rage\n",
				    desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes);

				memorystatus_kill_on_vnode_limit();

				continue;
			}
		}
#endif

		/* The async work queue takes priority over the reuse lists. */
		if (!TAILQ_EMPTY(&vnode_async_work_list)) {
			vp = TAILQ_FIRST(&vnode_async_work_list);
			async_work_handled++;

			vp = process_vp(vp, 0, false, &deferred);

			if (vp != NULLVP) {
				panic("found VBAD vp (%p) on async queue", vp);
			}
			continue;
		}

		free_q_empty = TAILQ_EMPTY(free_q);
		rage_q_empty = TAILQ_EMPTY(rage_q);

		if (!rage_q_empty && !free_q_empty) {
			struct timeval current_tv;

			/*
			 * Prefer the free list while the rage list is below its
			 * limit and still within its time window.
			 */
			microuptime(&current_tv);
			if (ragevnodes < rage_limit &&
			    ((current_tv.tv_sec - rage_tv.tv_sec) < RAGE_TIME_LIMIT)) {
				rage_q_empty = true;
			}
		}

		/* Nothing (more) to do: sleep until woken by wakeup_laundry_thread(). */
		if (numvnodes < numvnodes_min || (rage_q_empty && free_q_empty) ||
		    (reusablevnodes <= reusablevnodes_max && deadvnodes >= deadvnodes_high)) {
			assert_wait(free_q, (THREAD_UNINT));

			vnode_list_unlock();

			thread_block((thread_continue_t)vn_laundry_continue);

			continue;
		}

		if (!rage_q_empty) {
			vp = TAILQ_FIRST(rage_q);
		} else {
			vp = TAILQ_FIRST(free_q);
		}

		vp = process_vp(vp, 0, false, &deferred);

		if (vp != NULLVP) {
			/* If process_vp returns a vnode, it is locked and has a holdcount */
			vnode_drop_and_unlock(vp);
			vp = NULLVP;
		}
	}
}
5470 
5471 static inline void
wakeup_laundry_thread()5472 wakeup_laundry_thread()
5473 {
5474 	if (deadvnodes_noreuse || (numvnodes >= numvnodes_min && deadvnodes < deadvnodes_low &&
5475 	    (reusablevnodes > reusablevnodes_max || numvnodes >= desiredvnodes))) {
5476 		wakeup(&vnode_free_list);
5477 	}
5478 }
5479 
5480 /*
5481  * This must be called under vnode_list_lock() to prevent race when accessing
5482  * various vnode stats.
5483  */
5484 static void
send_freeable_vnodes_telemetry(void)5485 send_freeable_vnodes_telemetry(void)
5486 {
5487 	bool send_event = false;
5488 
5489 	/*
5490 	 * Log an event when the 'numvnodes' is above the freeable vnodes threshold
5491 	 * or when it falls back within the threshold.
5492 	 * When the 'numvnodes' is above the threshold, log an event when it has
5493 	 * been incrementally growing by 25%.
5494 	 */
5495 	if ((numvnodes > desiredvnodes) && (freevnodes + deadvnodes) == 0) {
5496 		long last_numvnodes = freeable_vnodes_telemetry.numvnodes;
5497 
5498 		if (numvnodes > (last_numvnodes + ((last_numvnodes * 25) / 100)) ||
5499 		    numvnodes >= numvnodes_max) {
5500 			send_event = true;
5501 		}
5502 		freeablevnodes_threshold_crossed = true;
5503 	} else if (freeablevnodes_threshold_crossed &&
5504 	    (freevnodes + deadvnodes) > busyvnodes) {
5505 		freeablevnodes_threshold_crossed = false;
5506 		send_event = true;
5507 	}
5508 
5509 	if (__improbable(send_event)) {
5510 		ca_event_t event = CA_EVENT_ALLOCATE_FLAGS(freeable_vnodes, Z_NOWAIT);
5511 
5512 		if (event) {
5513 			/*
5514 			 * Update the stats except the 'numvnodes_max' and 'desiredvnodes'
5515 			 * as they are immutable after init.
5516 			 */
5517 			freeable_vnodes_telemetry.numvnodes_min = numvnodes_min;
5518 			freeable_vnodes_telemetry.numvnodes = numvnodes;
5519 			freeable_vnodes_telemetry.freevnodes = freevnodes;
5520 			freeable_vnodes_telemetry.deadvnodes = deadvnodes;
5521 			freeable_vnodes_telemetry.freeablevnodes = freeablevnodes;
5522 			freeable_vnodes_telemetry.busyvnodes = busyvnodes;
5523 			freeable_vnodes_telemetry.threshold_crossed =
5524 			    freeablevnodes_threshold_crossed;
5525 
5526 			memcpy(event->data, &freeable_vnodes_telemetry,
5527 			    sizeof(CA_EVENT_TYPE(freeable_vnodes)));
5528 
5529 			if (!freeablevnodes_threshold_crossed) {
5530 				freeable_vnodes_telemetry.numvnodes = 0;
5531 			}
5532 			CA_EVENT_SEND(event);
5533 		}
5534 	}
5535 }
5536 
/*
 * Allocate or reuse a vnode, returned in *vpp with an iocount of 1.
 *
 * Strategy, in order:
 *   1. While under the desired vnode count (or when an allocation is being
 *      forced), reuse one from the dead list, else zalloc a brand new one.
 *   2. Otherwise try to reclaim a candidate from the rapid-age list, then
 *      the free list — skipping vnodes that could deadlock a
 *      dependency-capable process (e.g. diskimages-helper) and vnodes whose
 *      VCANDEALLOC state is incompatible with this caller's 'can_free'.
 *   3. If nothing is available, possibly force-allocate a freeable vnode,
 *      else sleep briefly and retry; after max_retries report ENFILE
 *      (under CONFIG_JETSAM, kill processes and keep trying instead).
 *
 * 'can_free' indicates the caller's filesystem tolerates vnodes that may
 * later be deallocated (VCANDEALLOC); it is forced off when
 * vn_dealloc_level is DEALLOC_VNODE_NONE.
 *
 * Returns 0 on success, or ENFILE with *vpp set to NULL.
 */
static int
new_vnode(vnode_t *vpp, bool can_free)
{
	long force_alloc_min;
	vnode_t vp;
#if CONFIG_JETSAM
	uint32_t retries = 0, max_retries = 2;                  /* retry incase of tablefull */
#else
	uint32_t retries = 0, max_retries = 100;                /* retry incase of tablefull */
#endif
	int force_alloc = 0, walk_count = 0;
	boolean_t need_reliable_vp = FALSE;
	int deferred;
	struct timeval initial_tv;
	struct timeval current_tv;
	proc_t  curproc = current_proc();
	bool force_alloc_freeable = false;

	if (vn_dealloc_level == DEALLOC_VNODE_NONE) {
		can_free = false;
	}

	initial_tv.tv_sec = 0;
retry:
	vp = NULLVP;

	vnode_list_lock();
	newvnode++;

	if (need_reliable_vp == TRUE) {
		async_work_timed_out++;
	}

	/*
	 * The vnode list lock was dropped after force_alloc_freeable was set,
	 * reevaluate.
	 */
	force_alloc_min = MAX(desiredvnodes, numvnodes_min);
	if (force_alloc_freeable &&
	    (numvnodes < force_alloc_min || numvnodes >= numvnodes_max)) {
		force_alloc_freeable = false;
	}

#if CONFIG_JETSAM
	/* Approaching the hard cap: ask the laundry thread to start jetsam. */
	if ((numvnodes_max > desiredvnodes) && numvnodes > (numvnodes_max - 100)
#if (DEVELOPMENT || DEBUG)
	    && !bootarg_no_vnode_jetsam
#endif
	    ) {
		do_async_jetsam = true;
		wakeup(&vnode_free_list);
	}
#endif /* CONFIG_JETSAM */

	if (((numvnodes - deadvnodes + deadvnodes_noreuse) < desiredvnodes) ||
	    force_alloc || force_alloc_freeable) {
		struct timespec ts;
		uint32_t vflag = 0;

		/*
		 * Can always reuse a dead one except if it is in the process of
		 * being freed or the FS cannot handle freeable vnodes.
		 */
		if (!TAILQ_EMPTY(&vnode_dead_list)) {
			/* Select an appropriate deadvnode */
			if (numvnodes <= numvnodes_min || !can_free) {
				/* all vnodes upto numvnodes_min are not freeable */
				vp = TAILQ_FIRST(&vnode_dead_list);
				if (numvnodes > numvnodes_min &&
				    (vp->v_flag & VCANDEALLOC)) {
					/*
					 * Freeable vnodes are added to the
					 * back of the queue, so if the first
					 * from the front is freeable, then
					 * there are none on the dead list.
					 */
					vp = NULLVP;
				}
			} else {
				/*
				 * Filesystems which opt in to freeable vnodes
				 * can get either one.
				 */
				TAILQ_FOREACH_REVERSE(vp, &vnode_dead_list,
				    deadlst, v_freelist) {
					if (!(vp->v_listflag & VLIST_NO_REUSE)) {
						break;
					}
				}
			}

			if (vp) {
				force_alloc_freeable = false;
				goto steal_this_vp;
			}
		}

		/*
		 * no dead vnodes available... if we're under
		 * the limit, we'll create a new vnode
		 */
		numvnodes++;
		if (force_alloc) {
			numvnodes_min++;
		} else if (can_free && (numvnodes > numvnodes_min)) {
			allocedvnodes++;
			freeablevnodes++;
			vflag = VCANDEALLOC;

			send_freeable_vnodes_telemetry();
		}
		vnode_list_unlock();

		if (nc_smr_enabled) {
			vp = zalloc_smr(vnode_zone, Z_WAITOK_ZERO_NOFAIL);
		} else {
			vp = zalloc_flags(vnode_zone, Z_WAITOK_ZERO_NOFAIL);
		}

		VLISTNONE(vp);          /* avoid double queue removal */
		lck_mtx_init(&vp->v_lock, &vnode_lck_grp, &vnode_lck_attr);

		TAILQ_INIT(&vp->v_ncchildren);

		klist_init(&vp->v_knotes);
		nanouptime(&ts);
		vp->v_id = (uint32_t)ts.tv_nsec;
		vp->v_flag = VSTANDARD | vflag;
		if (force_alloc_freeable) {
			/* This vnode should be recycled and freed immediately */
			vp->v_lflag = VL_MARKTERM;
			vp->v_listflag = VLIST_NO_REUSE;
		}

		if (vflag & VCANDEALLOC) {
			os_atomic_inc(&busyvnodes, relaxed);
		}

#if CONFIG_MACF
		if (mac_vnode_label_init_needed(vp)) {
			mac_vnode_label_init(vp);
		}
#endif /* MAC */

#if CONFIG_IOCOUNT_TRACE
		if (__improbable(bootarg_vnode_iocount_trace)) {
			vp->v_iocount_trace = (vnode_iocount_trace_t)zalloc_permanent(
				IOCOUNT_TRACE_MAX_TYPES * sizeof(struct vnode_iocount_trace),
				ZALIGN(struct vnode_iocount_trace));
		}
#endif /* CONFIG_IOCOUNT_TRACE */

#if CONFIG_FILE_LEASES
		LIST_INIT(&vp->v_leases);
#endif

		vp->v_iocount = 1;

		goto done;
	}

	microuptime(&current_tv);

#define MAX_WALK_COUNT 1000

	if (!TAILQ_EMPTY(&vnode_rage_list) &&
	    (ragevnodes >= rage_limit ||
	    (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {
		TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
			if (!(vp->v_listflag & VLIST_RAGE)) {
				panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);
			}

			// if we're a dependency-capable process, skip vnodes that can
			// cause recycling deadlocks. (i.e. this process is diskimages
			// helper and the vnode is in a disk image).  Querying the
			// mnt_kern_flag for the mount's virtual device status
			// is safer than checking the mnt_dependent_process, which
			// may not be updated if there are multiple devnode layers
			// in between the disk image and the final consumer.

			if (((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
			    (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) &&
			    !(vp->v_listflag & VLIST_NO_REUSE) &&
			    (can_free || !(vp->v_flag & VCANDEALLOC))) {
				/*
				 * if need_reliable_vp == TRUE, then we've already sent one or more
				 * non-reliable vnodes to the async thread for processing and timed
				 * out waiting for a dead vnode to show up.  Use the MAX_WALK_COUNT
				 * mechanism to first scan for a reliable vnode before forcing
				 * a new vnode to be created
				 */
				if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
					break;
				}
			}

			// don't iterate more than MAX_WALK_COUNT vnodes to
			// avoid keeping the vnode list lock held for too long.

			if (walk_count++ > MAX_WALK_COUNT) {
				vp = NULL;
				break;
			}
		}
	}

	if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
		/*
		 * Pick the first vp for possible reuse
		 */
		walk_count = 0;
		TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
			// if we're a dependency-capable process, skip vnodes that can
			// cause recycling deadlocks. (i.e. this process is diskimages
			// helper and the vnode is in a disk image).  Querying the
			// mnt_kern_flag for the mount's virtual device status
			// is safer than checking the mnt_dependent_process, which
			// may not be updated if there are multiple devnode layers
			// in between the disk image and the final consumer.

			if (((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
			    (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) &&
			    !(vp->v_listflag & VLIST_NO_REUSE) &&
			    (can_free || !(vp->v_flag & VCANDEALLOC))) {
				/*
				 * if need_reliable_vp == TRUE, then we've already sent one or more
				 * non-reliable vnodes to the async thread for processing and timed
				 * out waiting for a dead vnode to show up.  Use the MAX_WALK_COUNT
				 * mechanism to first scan for a reliable vnode before forcing
				 * a new vnode to be created
				 */
				if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
					break;
				}
			}

			// don't iterate more than MAX_WALK_COUNT vnodes to
			// avoid keeping the vnode list lock held for too long.

			if (walk_count++ > MAX_WALK_COUNT) {
				vp = NULL;
				break;
			}
		}
	}

	//
	// if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
	// then we're trying to create a vnode on behalf of a
	// process like diskimages-helper that has file systems
	// mounted on top of itself (and thus we can't reclaim
	// vnodes in the file systems on top of us).  if we can't
	// find a vnode to reclaim then we'll just have to force
	// the allocation.
	//
	if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
		force_alloc = 1;
		vnode_list_unlock();
		goto retry;
	}

	if (vp == NULL) {
		/* Last resort: allocate a freeable vnode past the desired count. */
		if (can_free && (vn_dealloc_level > DEALLOC_VNODE_NONE) &&
		    (numvnodes >= force_alloc_min) && (numvnodes < numvnodes_max)) {
			force_alloc_freeable = true;
			vnode_list_unlock();
			goto retry;
		}
		vnode_list_unlock();

		/*
		 * we've reached the system imposed maximum number of vnodes
		 * but there isn't a single one available
		 * wait a bit and then retry... if we can't get a vnode
		 * after our target number of retries, than log a complaint
		 */
		if (++retries <= max_retries) {
			delay_for_interval(1, 1000 * 1000);
			goto retry;
		}

		tablefull("vnode");
		log(LOG_EMERG, "%d desired, %ld numvnodes, "
		    "%ld free, %ld dead, %ld async, %d rage\n",
		    desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes);

#if CONFIG_JETSAM
		/*
		 * Running out of vnodes tends to make a system unusable. Start killing
		 * processes that jetsam knows are killable.
		 */
		if (memorystatus_kill_on_vnode_limit() == FALSE
#if DEVELOPMENT || DEBUG
		    || bootarg_no_vnode_jetsam
#endif
		    ) {
			/*
			 * If jetsam can't find any more processes to kill and there
			 * still aren't any free vnodes, panic. Hopefully we'll get a
			 * panic log to tell us why we ran out.
			 */
			panic("vnode table is full");
		}

		/*
		 * Now that we've killed someone, wait a bit and continue looking
		 */
		delay_for_interval(3, 1000 * 1000);
		retries = 0;
		goto retry;
#endif

		*vpp = NULL;
		return ENFILE;
	}
	newvnode_nodead++;
steal_this_vp:
	if ((vp = process_vp(vp, 1, true, &deferred)) == NULLVP) {
		if (deferred) {
			int     elapsed_msecs;
			struct timeval elapsed_tv;

			if (initial_tv.tv_sec == 0) {
				microuptime(&initial_tv);
			}

			vnode_list_lock();

			dead_vnode_waited++;
			dead_vnode_wanted++;

			/*
			 * note that we're only going to explicitly wait 10ms
			 * for a dead vnode to become available, since even if one
			 * isn't available, a reliable vnode might now be available
			 * at the head of the VRAGE or free lists... if so, we
			 * can satisfy the new_vnode request with less latency then waiting
			 * for the full 100ms duration we're ultimately willing to tolerate
			 */
			assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC);

			vnode_list_unlock();

			thread_block(THREAD_CONTINUE_NULL);

			microuptime(&elapsed_tv);

			timevalsub(&elapsed_tv, &initial_tv);
			elapsed_msecs = (int)(elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000);

			if (elapsed_msecs >= 100) {
				/*
				 * we've waited long enough... 100ms is
				 * somewhat arbitrary for this case, but the
				 * normal worst case latency used for UI
				 * interaction is 100ms, so I've chosen to
				 * go with that.
				 *
				 * setting need_reliable_vp to TRUE
				 * forces us to find a reliable vnode
				 * that we can process synchronously, or
				 * to create a new one if the scan for
				 * a reliable one hits the scan limit
				 */
				need_reliable_vp = TRUE;
			}
		}
		goto retry;
	}
	OSAddAtomicLong(1, &num_reusedvnodes);


#if CONFIG_MACF
	/*
	 * We should never see VL_LABELWAIT or VL_LABEL here.
	 * as those operations hold a reference.
	 */
	assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
	assert((vp->v_lflag & VL_LABEL) != VL_LABEL);
	if (vp->v_lflag & VL_LABELED || mac_vnode_label(vp) != NULL) {
		vnode_lock_convert(vp);
		mac_vnode_label_recycle(vp);
	} else if (mac_vnode_label_init_needed(vp)) {
		vnode_lock_convert(vp);
		mac_vnode_label_init(vp);
	}

#endif /* MAC */

	/* Reset the recycled vnode to a pristine state for its new user. */
	vp->v_iocount = 1;
	vp->v_lflag = 0;
	vp->v_writecount = 0;
	vp->v_references = 0;
	vp->v_iterblkflags = 0;
	if (can_free && (vp->v_flag & VCANDEALLOC)) {
		vp->v_flag = VSTANDARD | VCANDEALLOC;
	} else {
		vp->v_flag = VSTANDARD;
	}

	/* vbad vnodes can point to dead_mountp */
	vp->v_mount = NULL;
	vp->v_defer_reclaimlist = (vnode_t)0;

	/* process_vp returns a locked vnode with a holdcount */
	vnode_drop_and_unlock(vp);

done:
	*vpp = vp;

	return 0;
}
5950 
/* Acquire the vnode's mutex (blocking acquisition). */
void
vnode_lock(vnode_t vp)
{
	lck_mtx_lock(&vp->v_lock);
}
5956 
/* Acquire the vnode's mutex using the spin variant (lck_mtx_lock_spin). */
void
vnode_lock_spin(vnode_t vp)
{
	lck_mtx_lock_spin(&vp->v_lock);
}
5962 
/* Release the vnode's mutex. */
void
vnode_unlock(vnode_t vp)
{
	lck_mtx_unlock(&vp->v_lock);
}
5968 
/*
 * Take a holdcount reference on a vnode.  The holdcount keeps the vnode
 * structure itself from being deallocated (see vnode_drop_internal); it
 * does not prevent the vnode from being reclaimed.  Panics on overflow.
 */
void
vnode_hold(vnode_t vp)
{
	int32_t old_holdcount = os_atomic_inc_orig(&vp->v_holdcount, relaxed);

	if (old_holdcount == INT32_MAX) {
		/*
		 * Because we allow atomic ops on the holdcount it is
		 * possible that when the vnode is examined, its holdcount
		 * is different than what will be printed in this
		 * panic message.
		 */
		panic("%s: vp %p holdcount overflow from : %d v_tag = %d, v_type = %d, v_flag = %x.",
		    __FUNCTION__, vp, old_holdcount, vp->v_tag, vp->v_type, vp->v_flag);
	}
}
5985 
#define VNODE_HOLD_NO_SMR    (1<<29) /* Disable vnode_hold_smr */

/*
 * To be used when smr is the only protection (cache_lookup and cache_lookup_path)
 *
 * Takes a holdcount only if VNODE_HOLD_NO_SMR has not been set in the
 * holdcount (it is set when the vnode is about to be freed back to its
 * zone).  Returns true when the hold was taken; false means the vnode is
 * going away and must not be used.
 */
bool
vnode_hold_smr(vnode_t vp)
{
	int32_t holdcount;

	/*
	 * For "high traffic" vnodes like rootvnode, the atomic
	 * cmpexcg loop below can turn into a infinite loop, no need
	 * to do it for vnodes that won't be dealloc'ed
	 */
	if (!(os_atomic_load(&vp->v_flag, relaxed) & VCANDEALLOC)) {
		vnode_hold(vp);
		return true;
	}

	for (;;) {
		holdcount = os_atomic_load(&vp->v_holdcount, relaxed);

		/* Vnode is being torn down; caller must back off. */
		if (holdcount & VNODE_HOLD_NO_SMR) {
			return false;
		}

		/* Publish the incremented count only if it didn't change under us. */
		if ((os_atomic_cmpxchg(&vp->v_holdcount, holdcount, holdcount + 1, relaxed) != 0)) {
			return true;
		}
	}
}
6018 
6019 /*
6020  * free callback from smr enabled zones
6021  */
6022 static void
vnode_smr_free(void * _vp,__unused size_t _size)6023 vnode_smr_free(void *_vp, __unused size_t _size)
6024 {
6025 	vnode_t vp = _vp;
6026 
6027 	bzero(vp, sizeof(*vp));
6028 }
6029 
/*
 * Drop a holdcount reference.
 *
 * If this was the last hold on a dead (VL_DEAD), deallocatable
 * (VCANDEALLOC) vnode, attempt to remove it from the dead list and free it
 * back to the vnode zone — first swinging the holdcount from 0 to
 * VNODE_HOLD_NO_SMR so SMR-based lookups (vnode_hold_smr) can no longer
 * resurrect it.
 *
 * 'locked' says whether the caller holds the vnode lock; the lock is never
 * held on return.  Returns the vnode, or NULLVP if it was freed.
 */
static vnode_t
vnode_drop_internal(vnode_t vp, bool locked)
{
	int32_t old_holdcount = os_atomic_dec_orig(&vp->v_holdcount, relaxed);

	if (old_holdcount < 1) {
		if (locked) {
			vnode_unlock(vp);
		}

		/*
		 * Because we allow atomic ops on the holdcount it is possible
		 * that when the vnode is examined, its holdcount is different
		 * than what will be printed in this panic message.
		 */
		panic("%s : vp %p holdcount -ve: %d.  v_tag = %d, v_type = %d, v_flag = %x.",
		    __FUNCTION__, vp, old_holdcount - 1, vp->v_tag, vp->v_type, vp->v_flag);
	}

	/* Fast path: the vnode is not a deallocation candidate. */
	if (vn_dealloc_level == DEALLOC_VNODE_NONE || old_holdcount > 1 ||
	    !(vp->v_flag & VCANDEALLOC) || !(vp->v_lflag & VL_DEAD)) {
		if (locked) {
			vnode_unlock(vp);
		}
		return vp;
	}

	if (!locked) {
		vnode_lock(vp);
	}

	/* Re-check under the vnode lock; another thread may have taken refs. */
	if ((os_atomic_load(&vp->v_holdcount, relaxed) != 0) || vp->v_iocount ||
	    vp->v_usecount || !(vp->v_flag & VCANDEALLOC) || !(vp->v_lflag & VL_DEAD)) {
		vnode_unlock(vp);
		return vp;
	}

	vnode_list_lock();

	/*
	 * the v_listflag field is protected by the vnode_list_lock
	 */
	if (VONLIST(vp) && (vp->v_listflag & VLIST_DEAD) &&
	    (numvnodes > desiredvnodes || (vp->v_listflag & VLIST_NO_REUSE) ||
	    vn_dealloc_level != DEALLOC_VNODE_ALL || deadvnodes >= deadvnodes_high) &&
	    (os_atomic_cmpxchg(&vp->v_holdcount, 0, VNODE_HOLD_NO_SMR, relaxed) != 0)) {
		VREMDEAD("vnode_list_remove", vp);
		numvnodes--;
		freeablevnodes--;
		deallocedvnodes++;
		vp->v_listflag = 0;

		send_freeable_vnodes_telemetry();
		vnode_list_unlock();

#if CONFIG_MACF
		/* Detach the MAC label before unlocking; free it afterwards. */
		struct label *tmpl = mac_vnode_label(vp);
		vp->v_label = NULL;
#endif /* CONFIG_MACF */

		vnode_unlock(vp);

#if CONFIG_MACF
		if (tmpl) {
			mac_vnode_label_free(tmpl);
		}
#endif /* CONFIG_MACF */

		if (nc_smr_enabled) {
			zfree_smr(vnode_zone, vp);
		} else {
			zfree(vnode_zone, vp);
		}

		vp = NULLVP;
	} else {
		vnode_list_unlock();
		vnode_unlock(vp);
	}

	return vp;
}
6112 
/* Drop a holdcount and release the (held) vnode lock; may free the vnode. */
vnode_t
vnode_drop_and_unlock(vnode_t vp)
{
	return vnode_drop_internal(vp, true);
}
6118 
/* Drop a holdcount on an unlocked vnode; may free the vnode (returns NULLVP). */
vnode_t
vnode_drop(vnode_t vp)
{
	return vnode_drop_internal(vp, false);
}
6124 
/* vfs.vnstats.*: read-only counters describing the vnode table population. */
SYSCTL_NODE(_vfs, OID_AUTO, vnstats, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "vfs vnode stats");

SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, vn_dealloc_level,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &vn_dealloc_level, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, desired_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &desiredvnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_vnodes_min,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes_min, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_vnodes_max,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes_max, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_deallocable_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &freeablevnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_deallocable_busy_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &busyvnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_dead_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deadvnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_dead_vnodes_to_dealloc,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deadvnodes_noreuse, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_async_work_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &async_work_vnodes, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_rapid_aging_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ragevnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_free_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &freevnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_recycledvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &num_recycledvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_allocedvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &allocedvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_deallocedvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deallocedvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_newvnode_calls,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &newvnode, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_newvnode_calls_nodead,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &newvnode_nodead, "");
6178 
/*
 * Take an iocount on a vnode.
 * Convenience wrapper around vnode_get_locked() that handles the locking.
 */
int
vnode_get(struct vnode *vp)
{
	int error;

	vnode_lock_spin(vp);
	error = vnode_get_locked(vp);
	vnode_unlock(vp);

	return error;
}
6190 
6191 int
vnode_get_locked(struct vnode * vp)6192 vnode_get_locked(struct vnode *vp)
6193 {
6194 #if DIAGNOSTIC
6195 	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
6196 #endif
6197 	if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
6198 		return ENOENT;
6199 	}
6200 
6201 	if (os_add_overflow(vp->v_iocount, 1, &vp->v_iocount)) {
6202 		panic("v_iocount overflow");
6203 	}
6204 
6205 #ifdef CONFIG_IOCOUNT_TRACE
6206 	record_vp(vp, 1);
6207 #endif
6208 	return 0;
6209 }
6210 
6211 /*
6212  * vnode_getwithvid() cuts in line in front of a vnode drain (that is,
6213  * while the vnode is draining, but at no point after that) to prevent
6214  * deadlocks when getting vnodes from filesystem hashes while holding
6215  * resources that may prevent other iocounts from being released.
6216  */
6217 int
vnode_getwithvid(vnode_t vp,uint32_t vid)6218 vnode_getwithvid(vnode_t vp, uint32_t vid)
6219 {
6220 	return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO));
6221 }
6222 
6223 /*
6224  * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode
6225  * drain; it exists for use in the VFS name cache, where we really do want to block behind
6226  * vnode drain to prevent holding off an unmount.
6227  */
6228 int
vnode_getwithvid_drainok(vnode_t vp,uint32_t vid)6229 vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
6230 {
6231 	return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID));
6232 }
6233 
6234 int
vnode_getwithref(vnode_t vp)6235 vnode_getwithref(vnode_t vp)
6236 {
6237 	return vget_internal(vp, 0, 0);
6238 }
6239 
6240 __private_extern__ int
vnode_getwithref_noblock(vnode_t vp)6241 vnode_getwithref_noblock(vnode_t vp)
6242 {
6243 	return vget_internal(vp, 0, VNODE_NOBLOCK);
6244 }
6245 
6246 __private_extern__ int
vnode_getalways(vnode_t vp)6247 vnode_getalways(vnode_t vp)
6248 {
6249 	return vget_internal(vp, 0, VNODE_ALWAYS);
6250 }
6251 
6252 __private_extern__ int
vnode_getalways_from_pager(vnode_t vp)6253 vnode_getalways_from_pager(vnode_t vp)
6254 {
6255 	return vget_internal(vp, 0, VNODE_ALWAYS | VNODE_PAGER);
6256 }
6257 
6258 static inline void
vn_set_dead(vnode_t vp)6259 vn_set_dead(vnode_t vp)
6260 {
6261 	vp->v_mount = NULL;
6262 	vp->v_op = dead_vnodeop_p;
6263 	vp->v_tag = VT_NON;
6264 	vp->v_data = NULL;
6265 	vp->v_type = VBAD;
6266 	vp->v_lflag |= VL_DEAD;
6267 }
6268 
/*
 * Drop an iocount, running VNOP_INACTIVE and/or initiating reclaim when
 * this was the last iocount and no usecounts remain.  Called with the
 * vnode lock held; the lock is dropped and retaken around VNOP_INACTIVE,
 * hence the retry loop.  When from_pager is true, reclaim is handed to
 * the async cleaner thread instead of being done inline (deadlock
 * avoidance).  Always returns 0.
 */
static int
vnode_put_internal_locked(vnode_t vp, bool from_pager)
{
	vfs_context_t ctx = vfs_context_current();      /* hoist outside loop */

#if DIAGNOSTIC
	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
#endif
retry:
	if (vp->v_iocount < 1) {
		panic("vnode_put(%p): iocount < 1", vp);
	}

	/* Still referenced elsewhere: just drop our iocount and leave. */
	if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
		vnode_dropiocount(vp);
		return 0;
	}

	/* Last reference on a live vnode that owes a VNOP_INACTIVE call. */
	if (((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE)) {
		vp->v_lflag &= ~VL_NEEDINACTIVE;

		if (UBCINFOEXISTS(vp)) {
			/* Frees code-signing state and unlocks the vnode in one call. */
			ubc_cs_free_and_vnode_unlock(vp);
		} else {
			vnode_unlock(vp);
		}

		VNOP_INACTIVE(vp, ctx);

		vnode_lock_spin(vp);
		/*
		 * because we had to drop the vnode lock before calling
		 * VNOP_INACTIVE, the state of this vnode may have changed...
		 * we may pick up both VL_MARKTERM and either
		 * an iocount or a usecount while in the VNOP_INACTIVE call
		 * we don't want to call vnode_reclaim_internal on a vnode
		 * that has active references on it... so loop back around
		 * and reevaluate the state
		 */
		goto retry;
	}
	vp->v_lflag &= ~VL_NEEDINACTIVE;

	/* Marked for termination but reclaim not yet started: kick it off. */
	if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) {
		if (from_pager) {
			/*
			 * We can't initiate reclaim when called from the pager
			 * because it will deadlock with itself so we hand it
			 * off to the async cleaner thread.
			 */
			vnode_async_list_add(vp);
		} else {
			vnode_lock_convert(vp);
			vnode_reclaim_internal(vp, 1, 1, 0);
		}
	}
	vnode_dropiocount(vp);
	vnode_list_add(vp);

	return 0;
}
6330 
6331 int
vnode_put_locked(vnode_t vp)6332 vnode_put_locked(vnode_t vp)
6333 {
6334 	return vnode_put_internal_locked(vp, false);
6335 }
6336 
6337 int
vnode_put(vnode_t vp)6338 vnode_put(vnode_t vp)
6339 {
6340 	int retval;
6341 
6342 	vnode_lock_spin(vp);
6343 	vnode_hold(vp);
6344 	retval = vnode_put_internal_locked(vp, false);
6345 	vnode_drop_and_unlock(vp);
6346 
6347 	return retval;
6348 }
6349 
6350 int
vnode_put_from_pager(vnode_t vp)6351 vnode_put_from_pager(vnode_t vp)
6352 {
6353 	int retval;
6354 
6355 	vnode_lock_spin(vp);
6356 	vnode_hold(vp);
6357 	/* Cannot initiate reclaim while paging */
6358 	retval = vnode_put_internal_locked(vp, true);
6359 	vnode_drop_and_unlock(vp);
6360 
6361 	return retval;
6362 }
6363 
6364 int
vnode_writecount(vnode_t vp)6365 vnode_writecount(vnode_t vp)
6366 {
6367 	return vp->v_writecount;
6368 }
6369 
6370 /* is vnode_t in use by others?  */
6371 int
vnode_isinuse(vnode_t vp,int refcnt)6372 vnode_isinuse(vnode_t vp, int refcnt)
6373 {
6374 	return vnode_isinuse_locked(vp, refcnt, 0);
6375 }
6376 
6377 int
vnode_usecount(vnode_t vp)6378 vnode_usecount(vnode_t vp)
6379 {
6380 	return vp->v_usecount;
6381 }
6382 
6383 int
vnode_iocount(vnode_t vp)6384 vnode_iocount(vnode_t vp)
6385 {
6386 	return vp->v_iocount;
6387 }
6388 
6389 int
vnode_isinuse_locked(vnode_t vp,int refcnt,int locked)6390 vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
6391 {
6392 	int retval = 0;
6393 
6394 	if (!locked) {
6395 		vnode_lock_spin(vp);
6396 	}
6397 	if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) {
6398 		retval = 1;
6399 		goto out;
6400 	}
6401 	if (vp->v_type == VREG) {
6402 		retval = ubc_isinuse_locked(vp, refcnt, 1);
6403 	}
6404 
6405 out:
6406 	if (!locked) {
6407 		vnode_unlock(vp);
6408 	}
6409 	return retval;
6410 }
6411 
6412 kauth_cred_t
vnode_cred(vnode_t vp)6413 vnode_cred(vnode_t vp)
6414 {
6415 	if (vp->v_cred) {
6416 		return kauth_cred_require(vp->v_cred);
6417 	}
6418 
6419 	return NULL;
6420 }
6421 
6422 
/*
 * Resume a vnode previously suspended via vnode_suspend().  Only the
 * suspending thread (v_owner) clears the suspension; threads sleeping
 * on v_iocount (vnode_getiocount/vnode_drain) are woken.  Always
 * returns 0, even if this thread was not the owner.
 */
errno_t
vnode_resume(vnode_t vp)
{
	/*
	 * NOTE(review): v_lflag and v_owner are tested before taking the
	 * vnode lock -- presumably safe because only the owner thread can
	 * clear VL_SUSPENDED; confirm.
	 */
	if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) {
		vnode_lock_spin(vp);
		vp->v_lflag &= ~VL_SUSPENDED;
		vp->v_owner = NULL;
		vnode_unlock(vp);

		/* Wake sleepers in vnode_getiocount()/vnode_drain(). */
		wakeup(&vp->v_iocount);
	}
	return 0;
}
6437 
/* suspend vnode_t
 * Please do not use on more than one vnode at a time as it may
 * cause deadlocks.
 * xxx should we explicitly prevent this from happening?
 */
6443 
errno_t
vnode_suspend(vnode_t vp)
{
	/* Unlocked fast-path check: already suspended, fail immediately. */
	if (vp->v_lflag & VL_SUSPENDED) {
		return EBUSY;
	}

	vnode_lock_spin(vp);

	/*
	 * xxx is this sufficient to check if a vnode_drain is in
	 * progress?
	 */

	/* Only take ownership if no drain/suspend owner already exists. */
	if (vp->v_owner == NULL) {
		vp->v_lflag |= VL_SUSPENDED;
		vp->v_owner = current_thread();
	}
	vnode_unlock(vp);

	/*
	 * NOTE(review): returns 0 even when v_owner was non-NULL and the
	 * vnode was NOT suspended -- callers cannot tell the difference;
	 * confirm this is intentional.
	 */
	return 0;
}
6466 
6467 /*
6468  * Release any blocked locking requests on the vnode.
6469  * Used for forced-unmounts.
6470  *
6471  * XXX	What about network filesystems?
6472  */
6473 static void
vnode_abort_advlocks(vnode_t vp)6474 vnode_abort_advlocks(vnode_t vp)
6475 {
6476 	if (vp->v_flag & VLOCKLOCAL) {
6477 		lf_abort_advlocks(vp);
6478 	}
6479 }
6480 
6481 
/*
 * Wait until all other iocounts on the vnode are gone (v_iocount == 1,
 * i.e. only the caller's reference remains).  Sets VL_DRAIN and records
 * this thread as v_owner for the duration; msleep() sleeps on the vnode
 * lock, so the lock must be held on entry.  Always returns 0 (the panic
 * path never returns).
 */
static errno_t
vnode_drain(vnode_t vp)
{
	if (vp->v_lflag & VL_DRAIN) {
		panic("vnode_drain: recursive drain");
		return ENOENT;
	}
	vp->v_lflag |= VL_DRAIN;
	vp->v_owner = current_thread();

	while (vp->v_iocount > 1) {
		if (bootarg_no_vnode_drain) {
			/* Bounded wait: 10s normally, shortened to 1s once unmountall starts. */
			struct timespec ts = {.tv_sec = 10, .tv_nsec = 0};
			int error;

			if (vfs_unmountall_started) {
				ts.tv_sec = 1;
			}

			error = msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain_with_timeout", &ts);

			/* Try to deal with leaked iocounts under bootarg and shutting down */
			if (vp->v_iocount > 1 && error == EWOULDBLOCK &&
			    ts.tv_sec == 1 && vp->v_numoutput == 0) {
				/* Forcibly discard the leaked references so shutdown can proceed. */
				vp->v_iocount = 1;
				break;
			}
		} else {
			msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
		}
	}

	vp->v_lflag &= ~VL_DRAIN;

	return 0;
}
6518 
6519 
/*
 * if the number of recent references via vnode_getwithvid or vnode_getwithref
 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
 * the LRU list if it's currently on it... once the iocount and usecount both drop
 * to 0, it will get put back on the end of the list, effectively making it younger
 * this allows us to keep actively referenced vnodes in the list without having
 * to constantly remove and add to the list each time a vnode w/o a usecount is
 * referenced which costs us taking and dropping a global lock twice.
 * However, if the vnode is marked DIRTY, we want to pull it out much earlier
 */
6530 #define UNAGE_THRESHHOLD        25
6531 #define UNAGE_DIRTYTHRESHHOLD    6
6532 
/*
 * Core iocount-acquisition state machine, called with the vnode lock held
 * (spin on entry; converted to full before sleeping).  The vflags bits
 * (VNODE_NODEAD, VNODE_NOSUSPEND, VNODE_ALWAYS, VNODE_DRAINO,
 * VNODE_WITHID, VNODE_PAGER, VNODE_NOBLOCK) select how drain/suspend/
 * terminate states are handled.  Returns 0 with v_iocount bumped, or
 * ENOENT/ENODEV/msleep error without it.
 */
errno_t
vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
{
	int nodead = vflags & VNODE_NODEAD;
	int nosusp = vflags & VNODE_NOSUSPEND;
	int always = vflags & VNODE_ALWAYS;
	int beatdrain = vflags & VNODE_DRAINO;
	int withvid = vflags & VNODE_WITHID;
	int forpager = vflags & VNODE_PAGER;
	int noblock = vflags & VNODE_NOBLOCK;

	for (;;) {
		int sleepflg = 0;

		/*
		 * if it is a dead vnode with deadfs
		 */
		if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
			return ENOENT;
		}
		/*
		 * will return VL_DEAD ones
		 */
		if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0) {
			break;
		}
		/*
		 * if suspended vnodes are to be failed
		 */
		if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
			return ENOENT;
		}
		/*
		 * if you are the owner of drain/suspend/termination, you can
		 * acquire the iocount.  Check VL_TERMINATE too even though it
		 * does not set owner.
		 */
		if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
		    (vp->v_owner == current_thread())) {
			break;
		}

		/* VNODE_ALWAYS: acquire regardless of drain/suspend/terminate. */
		if (always != 0) {
			break;
		}

		/* VNODE_NOBLOCK: fail rather than sleep on a transitioning vnode. */
		if (noblock && (vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE))) {
			return ENOENT;
		}

		/*
		 * If this vnode is getting drained, there are some cases where
		 * we can't block or, in case of tty vnodes, want to be
		 * interruptible.
		 */
		if (vp->v_lflag & VL_DRAIN) {
			/*
			 * In some situations, we want to get an iocount
			 * even if the vnode is draining to prevent deadlock,
			 * e.g. if we're in the filesystem, potentially holding
			 * resources that could prevent other iocounts from
			 * being released.
			 */
			if (beatdrain) {
				break;
			}
			/*
			 * Don't block if the vnode's mount point is unmounting as
			 * we may be the thread the unmount is itself waiting on.
			 * Only callers who pass in vids (at this point, we've already
			 * handled nosusp and nodead) are expecting error returns
			 * from this function, so we can only return errors for
			 * those. ENODEV is intended to inform callers that the call
			 * failed because an unmount is in progress.
			 */
			if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) {
				return ENODEV;
			}

			/* tty vnodes: allow the sleep to be interrupted by signals. */
			if (vnode_istty(vp)) {
				sleepflg = PCATCH;
			}
		}

		vnode_lock_convert(vp);

		if (vp->v_lflag & VL_TERMINATE) {
			int error;

			/* Ask vnode_reclaim_internal() for a wakeup when VL_TERMINATE clears. */
			vp->v_lflag |= VL_TERMWANT;

			error = msleep(&vp->v_lflag, &vp->v_lock,
			    (PVFS | sleepflg), "vnode getiocount", NULL);
			if (error) {
				return error;
			}
		} else {
			msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
		}
	}
	/* Vid changed while we slept: the vnode was recycled out from under us. */
	if (withvid && vid != vp->v_id) {
		return ENOENT;
	}
	/* Hot vnode: un-age it by pulling it off the LRU (see UNAGE_THRESHHOLD). */
	if (!forpager && (++vp->v_references >= UNAGE_THRESHHOLD ||
	    (vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD))) {
		vp->v_references = 0;
		vnode_list_remove(vp);
	}
	vp->v_iocount++;
#ifdef CONFIG_IOCOUNT_TRACE
	record_vp(vp, 1);
#endif
	return 0;
}
6646 
6647 static void
vnode_dropiocount(vnode_t vp)6648 vnode_dropiocount(vnode_t vp)
6649 {
6650 	if (vp->v_iocount < 1) {
6651 		panic("vnode_dropiocount(%p): v_iocount < 1", vp);
6652 	}
6653 
6654 	vp->v_iocount--;
6655 #ifdef CONFIG_IOCOUNT_TRACE
6656 	record_vp(vp, -1);
6657 #endif
6658 	if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) {
6659 		wakeup(&vp->v_iocount);
6660 	}
6661 }
6662 
6663 
void
vnode_reclaim(struct vnode * vp)
{
	/* Public entry: unlocked, no reuse, no flags. */
	vnode_reclaim_internal(vp, 0, 0, 0);
}
6669 
/*
 * Reclaim a vnode: drain outstanding iocounts, tear down the filesystem
 * state (vgone), bump v_id so stale vnode_getwithvid() callers fail, and
 * return the vnode to a pristine VBAD state.  'locked' says whether the
 * caller already holds the vnode lock; 'reuse' suppresses re-adding the
 * vnode to a list (caller will reuse it directly); 'flags' (e.g.
 * REVOKEALL) is forwarded to vgone().
 */
__private_extern__
void
vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
{
	int isfifo = 0;
	bool clear_tty_revoke = false;

	if (!locked) {
		vnode_lock(vp);
	}

	if (vp->v_lflag & VL_TERMINATE) {
		panic("vnode reclaim in progress");
	}
	vp->v_lflag |= VL_TERMINATE;

	vn_clearunionwait(vp, 1);

	/*
	 * We have to force any terminals in reads to return and give up
	 * their iocounts. It's important to do this after VL_TERMINATE
	 * has been set to ensure new reads are blocked while the
	 * revoke is in progress.
	 */
	if (vnode_istty(vp) && (flags & REVOKEALL) && (vp->v_iocount > 1)) {
		vnode_unlock(vp);
		VNOP_IOCTL(vp, TIOCREVOKE, (caddr_t)NULL, 0, vfs_context_kernel());
		clear_tty_revoke = true;
		vnode_lock(vp);
	}

	/* Wait for all other iocounts to go away (see vnode_drain()). */
	vnode_drain(vp);

	if (clear_tty_revoke) {
		vnode_unlock(vp);
		VNOP_IOCTL(vp, TIOCREVOKECLEAR, (caddr_t)NULL, 0, vfs_context_kernel());
		vnode_lock(vp);
	}

#if CONFIG_FILE_LEASES
	/*
	 * Revoke all leases in place for this vnode as it is about to be reclaimed.
	 * In normal case, there shouldn't be any leases in place by the time we
	 * get here as there shouldn't be any opens on the vnode (usecount == 0).
	 * However, in the case of force unmount or unmount of a volume that
	 * contains file that was opened with O_EVTONLY then the vnode can be
	 * reclaimed while the file is still opened.
	 */
	vnode_revokelease(vp, true);
#endif

	isfifo = (vp->v_type == VFIFO);

	if (vp->v_type != VBAD) {
		vgone(vp, flags);               /* clean and reclaim the vnode */
	}
	/*
	 * give the vnode a new identity so that vnode_getwithvid will fail
	 * on any stale cache accesses...
	 * grab the list_lock so that if we're in "new_vnode"
	 * behind the list_lock trying to steal this vnode, the v_id is stable...
	 * once new_vnode drops the list_lock, it will block trying to take
	 * the vnode lock until we release it... at that point it will evaluate
	 * whether the v_vid has changed
	 * also need to make sure that the vnode isn't on a list where "new_vnode"
	 * can find it after the v_id has been bumped until we are completely done
	 * with the vnode (i.e. putting it back on a list has to be the very last
	 * thing we do to this vnode... many of the callers of vnode_reclaim_internal
	 * are holding an io_count on the vnode... they need to drop the io_count
	 * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
	 * they are completely done with the vnode
	 */
	vnode_list_lock();

	vnode_list_remove_locked(vp);
	vp->v_id++;

	vnode_list_unlock();

	if (isfifo) {
		struct fifoinfo * fip;

		fip = vp->v_fifoinfo;
		vp->v_fifoinfo = NULL;
		kfree_type(struct fifoinfo, fip);
	}
	vp->v_type = VBAD;

	/* Sanity checks: vgone() must have fully detached all state. */
	if (vp->v_data) {
		panic("vnode_reclaim_internal: cleaned vnode isn't");
	}
	if (vp->v_numoutput) {
		panic("vnode_reclaim_internal: clean vnode has pending I/O's");
	}
	if (UBCINFOEXISTS(vp)) {
		panic("vnode_reclaim_internal: ubcinfo not cleaned");
	}
	if (vp->v_parent) {
		panic("vnode_reclaim_internal: vparent not removed");
	}
	if (vp->v_name) {
		panic("vnode_reclaim_internal: vname not removed");
	}

#if CONFIG_FILE_LEASES
	if (__improbable(!LIST_EMPTY(&vp->v_leases))) {
		panic("vnode_reclaim_internal: vleases NOT empty");
	}
#endif

	vp->v_socket = NULL;

	vp->v_lflag &= ~VL_TERMINATE;
	vp->v_owner = NULL;

#if CONFIG_IOCOUNT_TRACE
	if (__improbable(bootarg_vnode_iocount_trace)) {
		bzero(vp->v_iocount_trace,
		    IOCOUNT_TRACE_MAX_TYPES * sizeof(struct vnode_iocount_trace));
	}
#endif /* CONFIG_IOCOUNT_TRACE */

	/* Notify kqueue watchers that the vnode has been revoked. */
	KNOTE(&vp->v_knotes, NOTE_REVOKE);

	/* Make sure that when we reuse the vnode, no knotes left over */
	klist_init(&vp->v_knotes);

	/* Wake threads in vnode_getiocount() that asked for VL_TERMWANT. */
	if (vp->v_lflag & VL_TERMWANT) {
		vp->v_lflag &= ~VL_TERMWANT;
		wakeup(&vp->v_lflag);
	}
	if (!reuse) {
		/*
		 * make sure we get on the
		 * dead list if appropriate
		 */
		vnode_list_add(vp);
	}
	if (!locked) {
		vnode_unlock(vp);
	}
}
6812 
/*
 * Common back-end for vnode_create()/vnode_create_ext()/vnode_initialize().
 * Allocates a new vnode (or initializes the one passed in via *vpp),
 * wires it to the filesystem node described by 'data' (a vnode_fsparam,
 * or vnode_trigger_param for VNCREATE_TRIGGER), handles device aliasing
 * (checkalias) for VCHR/VBLK, and enters the result into the mount list
 * and name cache as requested.  Returns 0 with *vpp set, or an errno.
 */
static int
vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp,
    vnode_create_options_t vc_options)
{
	int error;
	int insert = 1;
	vnode_t vp = NULLVP;
	vnode_t nvp;
	vnode_t dvp;
	struct  uthread *ut;
	struct componentname *cnp;
	struct vnode_fsparam *param = (struct vnode_fsparam *)data;
#if CONFIG_TRIGGERS
	struct vnode_trigger_param *tinfo = NULL;
#endif
	bool existing_vnode;
	bool init_vnode = !(vc_options & VNODE_CREATE_EMPTY);
	bool is_bdevvp = false;

	if (*vpp) {
		vp = *vpp;
		*vpp = NULLVP;
		existing_vnode = true;
	} else {
		existing_vnode = false;
	}

	if (init_vnode) {
		/* Do quick sanity check on the parameters. */
		if ((param == NULL) || (param->vnfs_vtype == VBAD)) {
			error = EINVAL;
			goto error_out;
		}

#if CONFIG_TRIGGERS
		if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) {
			tinfo = (struct vnode_trigger_param *)data;

			/* Validate trigger vnode input */
			if ((param->vnfs_vtype != VDIR) ||
			    (tinfo->vnt_resolve_func == NULL) ||
			    (tinfo->vnt_flags & ~VNT_VALID_MASK)) {
				error = EINVAL;
				goto error_out;
			}
			/* Fall through a normal create (params will be the same) */
			flavor = VNCREATE_FLAVOR;
			size = VCREATESIZE;
		}
#endif
		if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) {
			error = EINVAL;
			goto error_out;
		}
	}

	if (!existing_vnode) {
		if ((error = new_vnode(&vp, !(vc_options & VNODE_CREATE_NODEALLOC)))) {
			return error;
		}
		if (!init_vnode) {
			/* Make it so that it can be released by a vnode_put() */
			vnode_lock(vp);
			vn_set_dead(vp);
			vnode_unlock(vp);
			*vpp = vp;
			return 0;
		}
	} else {
		/*
		 * A vnode obtained by vnode_create_empty has been passed to
		 * vnode_initialize - Unset VL_DEAD set by vn_set_dead. After
		 * this point, it is set back on any error.
		 */
		vnode_lock(vp);
		vp->v_lflag &= ~VL_DEAD;
		vnode_unlock(vp);
	}

	dvp = param->vnfs_dvp;
	cnp = param->vnfs_cnp;

	/* Wire the vnode to the filesystem's vnode operations and private data. */
	vp->v_op = param->vnfs_vops;
	vp->v_type = (uint16_t)param->vnfs_vtype;
	vp->v_data = param->vnfs_fsnode;

	if (param->vnfs_markroot) {
		vp->v_flag |= VROOT;
	}
	if (param->vnfs_marksystem) {
		vp->v_flag |= VSYSTEM;
	}
	if (vp->v_type == VREG) {
		/* Regular files need UBC (unified buffer cache) state. */
		error = ubc_info_init_withsize(vp, param->vnfs_filesize);
		if (error) {
#ifdef CONFIG_IOCOUNT_TRACE
			record_vp(vp, 1);
#endif
			vnode_hold(vp);
			vnode_lock(vp);
			vn_set_dead(vp);

			vnode_put_locked(vp);
			vnode_drop_and_unlock(vp);
			return error;
		}
		if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED) {
			memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
		}
	}
#ifdef CONFIG_IOCOUNT_TRACE
	record_vp(vp, 1);
#endif

#if CONFIG_FIRMLINKS
	vp->v_fmlink = NULLVP;
#endif
	vp->v_flag &= ~VFMLINKTARGET;

#if CONFIG_TRIGGERS
	/*
	 * For trigger vnodes, attach trigger info to vnode
	 */
	if ((vp->v_type == VDIR) && (tinfo != NULL)) {
		/*
		 * Note: has a side effect of incrementing trigger count on the
		 * mount if successful, which we would need to undo on a
		 * subsequent failure.
		 */
#ifdef CONFIG_IOCOUNT_TRACE
		record_vp(vp, -1);
#endif
		error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE);
		if (error) {
			printf("vnode_create: vnode_resolver_create() err %d\n", error);
			vnode_hold(vp);
			vnode_lock(vp);
			vn_set_dead(vp);
#ifdef CONFIG_IOCOUNT_TRACE
			record_vp(vp, 1);
#endif
			vnode_put_locked(vp);
			vnode_drop_and_unlock(vp);
			return error;
		}
	}
#endif
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		vp->v_tag = VT_DEVFS;           /* callers will reset if needed (bdevvp) */

		if ((nvp = checkalias(vp, param->vnfs_rdev))) {
			/*
			 * if checkalias returns a vnode, it will be locked
			 *
			 * first get rid of the unneeded vnode we acquired
			 */
			vp->v_data = NULL;
			vp->v_op = spec_vnodeop_p;
			vp->v_type = VBAD;
			vp->v_lflag = VL_DEAD;
			vp->v_data = NULL;
			vp->v_tag = VT_NON;
			vnode_put(vp);

			/*
			 * switch to aliased vnode and finish
			 * preparing it
			 */
			vp = nvp;

			is_bdevvp = (vp->v_flag & VBDEVVP);

			if (is_bdevvp) {
				printf("%s: alias vnode (vid = %u) is in state of change (start) v_flags = 0x%x v_numoutput = %d\n",
				    __func__, vp->v_id, vp->v_flag, vp->v_numoutput);
			}

			/* Clean the alias and retarget it at this filesystem's node. */
			vnode_hold(vp);
			vp->v_lflag |= VL_OPSCHANGE;
			vclean(vp, 0);
			vp->v_op = param->vnfs_vops;
			vp->v_type = (uint16_t)param->vnfs_vtype;
			vp->v_data = param->vnfs_fsnode;
			vp->v_lflag = VL_OPSCHANGE;
			vp->v_mount = NULL;
			insmntque(vp, param->vnfs_mp);
			insert = 0;

			if (is_bdevvp) {
				printf("%s: alias vnode (vid = %u), is in state of change (end) v_flags = 0x%x v_numoutput = %d\n",
				    __func__, vp->v_id, vp->v_flag, vp->v_numoutput);
			}

			vnode_drop_and_unlock(vp);
			wakeup(&vp->v_lflag); /* chkvnlock is waiting for VL_DEAD to get unset */
		}

		if (VCHR == vp->v_type) {
			u_int maj = major(vp->v_rdev);

			/* Character devices backed by a tty driver get VISTTY. */
			if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) {
				vp->v_flag |= VISTTY;
			}
		}
	}

	if (vp->v_type == VFIFO) {
		struct fifoinfo *fip;

		fip = kalloc_type(struct fifoinfo, Z_WAITOK | Z_ZERO);
		vp->v_fifoinfo = fip;
	}
	/* The file systems must pass the address of the location where
	 * they store the vnode pointer. When we add the vnode into the mount
	 * list and name cache they become discoverable. So the file system node
	 * must have the connection to vnode setup by then
	 */
	*vpp = vp;

	/* Add fs named reference. */
	if (param->vnfs_flags & VNFS_ADDFSREF) {
		vp->v_lflag |= VNAMED_FSHASH;
	}
	if (param->vnfs_mp) {
		if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) {
			vp->v_flag |= VLOCKLOCAL;
		}
		if (insert) {
			if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
				panic("insmntque: vp on the free list");
			}

			/*
			 * enter in mount vnode list
			 */
			insmntque(vp, param->vnfs_mp);
		}
	}
	/* Remember the parent directory (holds a usecount on it via vnode_ref). */
	if (dvp && vnode_ref(dvp) == 0) {
		vp->v_parent = dvp;
	}
	if (cnp) {
		if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
			/*
			 * enter into name cache
			 * we've got the info to enter it into the name cache now
			 * cache_enter_create will pick up an extra reference on
			 * the name entered into the string cache
			 */
			vp->v_name = cache_enter_create(dvp, vp, cnp);
		} else {
			vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
		}

		if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) {
			vp->v_flag |= VISUNION;
		}
	}
	if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
		/*
		 * this vnode is being created as cacheable in the name cache
		 * this allows us to re-enter it in the cache
		 */
		vp->v_flag |= VNCACHEABLE;
	}
	ut = current_uthread();

	if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
	    (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) {
		/*
		 * process has indicated that it wants any
		 * vnodes created on its behalf to be rapidly
		 * aged to reduce the impact on the cached set
		 * of vnodes
		 *
		 * if UT_KERN_RAGE_VNODES is set, then the
		 * kernel internally wants vnodes to be rapidly
		 * aged, even if the process hasn't requested
		 * this
		 */
		vp->v_flag |= VRAGE;
	}

#if CONFIG_SECLUDED_MEMORY
	switch (secluded_for_filecache) {
	case SECLUDED_FILECACHE_NONE:
		/*
		 * secluded_for_filecache == 0:
		 * + no file contents in secluded pool
		 */
		break;
	case SECLUDED_FILECACHE_APPS:
		/*
		 * secluded_for_filecache == 1:
		 * + no files from /
		 * + files from /Applications/ are OK
		 * + files from /Applications/Camera are not OK
		 * + no files that are open for write
		 */
		if (vnode_vtype(vp) == VREG &&
		    vnode_mount(vp) != NULL &&
		    (!(vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) {
			/* not from root filesystem: eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(
				ubc_getobject(vp, UBC_FLAGS_NONE),
				TRUE);
		}
		break;
	case SECLUDED_FILECACHE_RDONLY:
		/*
		 * secluded_for_filecache == 2:
		 * + all read-only files OK, except:
		 *      + dyld_shared_cache_arm64*
		 *      + Camera
		 *	+ mediaserverd
		 */
		if (vnode_vtype(vp) == VREG) {
			memory_object_mark_eligible_for_secluded(
				ubc_getobject(vp, UBC_FLAGS_NONE),
				TRUE);
		}
		break;
	default:
		break;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	if (is_bdevvp) {
		/*
		 * The v_flags and v_lflags fields for the vnode above are
		 * manipulated without the vnode lock. This is fine for
		 * everything because no other use of this vnode is occurring.
		 * However the case of the bdevvp alias vnode reuse is different
		 * and the flags end up being modified while a thread may be in
		 * vnode_waitforwrites which sets VTHROTTLED and any one of the
		 * non atomic modifications of v_flag in this function can race
		 * with the setting of that flag and cause VTHROTTLED on vflag
		 * to get "lost".
		 *
		 * This should ideally be fixed by making sure all modifications
		 * in this function to the vnode flags are done under the
		 * vnode lock but at this time, a much smaller workaround is
		 * being employed and the more correct (and potentially
		 * much bigger) change will follow later.
		 *
		 * The effect of "losing" the VTHROTTLED flags would be a lost
		 * wakeup so we just issue that wakeup here since this happens
		 * only once per bdevvp vnode which are only one or two for a
		 * given boot.
		 */
		wakeup(&vp->v_numoutput);

		/*
		 * now make sure the flags that we were supposed to put aren't
		 * lost.
		 */
		vnode_lock_spin(vp);
		if (param->vnfs_flags & VNFS_ADDFSREF) {
			vp->v_lflag |= VNAMED_FSHASH;
		}
		if (param->vnfs_mp && (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)) {
			vp->v_flag |= VLOCKLOCAL;
		}
		if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
			vp->v_flag |= VNCACHEABLE;
		}
		vnode_unlock(vp);
	}

	return 0;

error_out:
	if (existing_vnode) {
		/* Release the iocount on the caller-supplied vnode. */
		vnode_put(vp);
	}
	return error;
}
7190 
7191 int
vnode_create_ext(uint32_t flavor,uint32_t size,void * data,vnode_t * vpp,vnode_create_options_t vc_options)7192 vnode_create_ext(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, vnode_create_options_t vc_options)
7193 {
7194 	if (vc_options & ~(VNODE_CREATE_EMPTY | VNODE_CREATE_NODEALLOC)) {
7195 		return EINVAL;
7196 	}
7197 	*vpp = NULLVP;
7198 	return vnode_create_internal(flavor, size, data, vpp, vc_options);
7199 }
7200 
7201 /* USAGE:
7202  * The following api creates a vnode and associates all the parameter specified in vnode_fsparam
7203  * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias
7204  * is obsoleted by this.
7205  */
7206 int
vnode_create(uint32_t flavor,uint32_t size,void * data,vnode_t * vpp)7207 vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
7208 {
7209 	return vnode_create_ext(flavor, size, data, vpp, VNODE_CREATE_NODEALLOC);
7210 }
7211 
7212 int
vnode_create_empty(vnode_t * vpp)7213 vnode_create_empty(vnode_t *vpp)
7214 {
7215 	return vnode_create_ext(VNCREATE_FLAVOR, VCREATESIZE, NULL,
7216 	           vpp, VNODE_CREATE_EMPTY);
7217 }
7218 
/*
 * Initialize a vnode previously obtained from vnode_create_empty() with
 * the filesystem-supplied parameters in 'data'.  Panics on a NULL vnode.
 * On DEVELOPMENT/DEBUG kernels, asserts that the vnode is still in the
 * pristine state vnode_create_empty() left it in (iocount 1, usecount 0,
 * VL_DEAD set, no FS private data), since vnode_create_internal relies
 * on having exclusive, unlocked access to it.
 */
int
vnode_initialize(uint32_t __unused flavor, uint32_t size, void *data, vnode_t *vpp)
{
	if (*vpp == NULLVP) {
		panic("NULL vnode passed to vnode_initialize");
	}
#if DEVELOPMENT || DEBUG
	/*
	 * We lock to check that vnode is fit for unlocked use in
	 * vnode_create_internal.
	 */
	vnode_lock_spin(*vpp);
	VNASSERT(((*vpp)->v_iocount == 1), *vpp,
	    ("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
	VNASSERT(((*vpp)->v_usecount == 0), *vpp,
	    ("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
	VNASSERT(((*vpp)->v_lflag & VL_DEAD), *vpp,
	    ("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
	    (*vpp)->v_lflag));
	VNASSERT(((*vpp)->v_data == NULL), *vpp,
	    ("vnode_initialize : v_data not NULL"));
	vnode_unlock(*vpp);
#endif
	return vnode_create_internal(flavor, size, data, vpp, VNODE_CREATE_DEFAULT);
}
7244 
7245 int
vnode_addfsref(vnode_t vp)7246 vnode_addfsref(vnode_t vp)
7247 {
7248 	vnode_lock_spin(vp);
7249 	if (vp->v_lflag & VNAMED_FSHASH) {
7250 		panic("add_fsref: vp already has named reference");
7251 	}
7252 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
7253 		panic("addfsref: vp on the free list");
7254 	}
7255 	vp->v_lflag |= VNAMED_FSHASH;
7256 	vnode_unlock(vp);
7257 	return 0;
7258 }
7259 int
vnode_removefsref(vnode_t vp)7260 vnode_removefsref(vnode_t vp)
7261 {
7262 	vnode_lock_spin(vp);
7263 	if ((vp->v_lflag & VNAMED_FSHASH) == 0) {
7264 		panic("remove_fsref: no named reference");
7265 	}
7266 	vp->v_lflag &= ~VNAMED_FSHASH;
7267 	vnode_unlock(vp);
7268 	return 0;
7269 }
7270 
7271 
/*
 * Iterate over all mounted filesystems, invoking 'callout' once per
 * live mount.  The mount list is snapshotted into an fsid array first
 * so the list lock is not held across callbacks; each mount is then
 * re-looked-up by fsid with an iteration reference held for the
 * duration of the callback.
 *
 * Flags:
 *	VFS_ITERATE_TAIL_FIRST     iterate newest-to-oldest
 *	VFS_ITERATE_CB_DROPREF     callback drops the iter ref itself
 *	VFS_ITERATE_NOSKIP_UNMOUNT visit mounts mid-unmount as well
 *
 * The callback's return value (VFS_RETURNED / VFS_RETURNED_DONE /
 * VFS_CLAIMED / VFS_CLAIMED_DONE) controls whether iteration continues.
 */
int
vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg)
{
	mount_t mp;
	int ret = 0;
	fsid_t * fsid_list;
	int count, actualcount, i;
	void * allocmem;
	int indx_start, indx_stop, indx_incr;
	int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF);
	int noskip_unmount = (flags & VFS_ITERATE_NOSKIP_UNMOUNT);

	/* Pad the snapshot for mounts that appear while we allocate. */
	count = mount_getvfscnt();
	count += 10;

	fsid_list = kalloc_data(count * sizeof(fsid_t), Z_WAITOK);
	allocmem = (void *)fsid_list;

	actualcount = mount_fillfsids(fsid_list, count);

	/*
	 * Establish the iteration direction
	 * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first)
	 */
	if (flags & VFS_ITERATE_TAIL_FIRST) {
		indx_start = actualcount - 1;
		indx_stop = -1;
		indx_incr = -1;
	} else { /* Head first by default */
		indx_start = 0;
		indx_stop = actualcount;
		indx_incr = 1;
	}

	for (i = indx_start; i != indx_stop; i += indx_incr) {
		/* obtain the mount point with iteration reference */
		mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);

		/* Mount may have gone away since the snapshot was taken. */
		if (mp == (struct mount *)0) {
			continue;
		}
		mount_lock(mp);
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (!noskip_unmount && (mp->mnt_lflag & MNT_LUNMOUNT))) {
			mount_unlock(mp);
			mount_iterdrop(mp);
			continue;
		}
		mount_unlock(mp);

		/* iterate over all the vnodes */
		ret = callout(mp, arg);

		/*
		 * Drop the iterref here if the callback didn't do it.
		 * Note: If cb_dropref is set the mp may no longer exist.
		 */
		if (!cb_dropref) {
			mount_iterdrop(mp);
		}

		switch (ret) {
		case VFS_RETURNED:
		case VFS_RETURNED_DONE:
			if (ret == VFS_RETURNED_DONE) {
				ret = 0;
				goto out;
			}
			break;

		case VFS_CLAIMED_DONE:
			ret = 0;
			goto out;
		case VFS_CLAIMED:
		default:
			break;
		}
		ret = 0;
	}

out:
	kfree_data(allocmem, count * sizeof(fsid_t));
	return ret;
}
7356 
/*
 * Update the vfsstatfs structure in the mountpoint.
 * MAC: Parameter eventtype added, indicating whether the event that
 * triggered this update came from user space, via a system call
 * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
 *
 * Returns 0 on success, or the error from vfs_getattr() / the MAC
 * getattr check.  On success mp->mnt_vfsstat has been refreshed from
 * the filesystem's reported attributes, with sensible defaults for
 * f_bsize and f_iosize when the FS does not supply them.
 */
int
vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
{
	struct vfs_attr va;
	int             error;

	/*
	 * Request the attributes we want to propagate into
	 * the per-mount vfsstat structure.
	 */
	VFSATTR_INIT(&va);
	VFSATTR_WANTED(&va, f_iosize);
	VFSATTR_WANTED(&va, f_blocks);
	VFSATTR_WANTED(&va, f_bfree);
	VFSATTR_WANTED(&va, f_bavail);
	VFSATTR_WANTED(&va, f_bused);
	VFSATTR_WANTED(&va, f_files);
	VFSATTR_WANTED(&va, f_ffree);
	VFSATTR_WANTED(&va, f_bsize);
	VFSATTR_WANTED(&va, f_fssubtype);

	if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
		KAUTH_DEBUG("STAT - filesystem returned error %d", error);
		return error;
	}
#if CONFIG_MACF
	/* Only user-initiated statfs-style requests are MAC-checked. */
	if (eventtype == VFS_USER_EVENT) {
		error = mac_mount_check_getattr(ctx, mp, &va);
		if (error != 0) {
			return error;
		}
	}
#endif
	/*
	 * Unpack into the per-mount structure.
	 *
	 * We only overwrite these fields, which are likely to change:
	 *	f_blocks
	 *	f_bfree
	 *	f_bavail
	 *	f_bused
	 *	f_files
	 *	f_ffree
	 *
	 * And these which are not, but which the FS has no other way
	 * of providing to us:
	 *	f_bsize
	 *	f_iosize
	 *	f_fssubtype
	 *
	 */
	if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
		/* 4822056 - protect against malformed server mount */
		mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512);
	} else {
		mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
		mp->mnt_vfsstat.f_iosize = va.f_iosize;
	} else {
		mp->mnt_vfsstat.f_iosize = 1024 * 1024;         /* 1MB sensible I/O size */
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) {
		mp->mnt_vfsstat.f_blocks = va.f_blocks;
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) {
		mp->mnt_vfsstat.f_bfree = va.f_bfree;
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) {
		mp->mnt_vfsstat.f_bavail = va.f_bavail;
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_bused)) {
		mp->mnt_vfsstat.f_bused = va.f_bused;
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_files)) {
		mp->mnt_vfsstat.f_files = va.f_files;
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) {
		mp->mnt_vfsstat.f_ffree = va.f_ffree;
	}

	/* this is unlikely to change, but has to be queried for */
	if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) {
		mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
	}

	return 0;
}
7451 
7452 int
mount_list_add(mount_t mp)7453 mount_list_add(mount_t mp)
7454 {
7455 	int res;
7456 
7457 	mount_list_lock();
7458 	if (get_system_inshutdown() != 0) {
7459 		res = -1;
7460 	} else {
7461 		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
7462 		nummounts++;
7463 		res = 0;
7464 	}
7465 	mount_list_unlock();
7466 
7467 	return res;
7468 }
7469 
7470 void
mount_list_remove(mount_t mp)7471 mount_list_remove(mount_t mp)
7472 {
7473 	mount_list_lock();
7474 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
7475 	nummounts--;
7476 	mp->mnt_list.tqe_next = NULL;
7477 	mp->mnt_list.tqe_prev = NULL;
7478 	mount_list_unlock();
7479 }
7480 
/*
 * Look up a mount by its volfs id (f_fsid.val[0]), considering only
 * mounts that are not being unmounted and that advertise
 * MNTK_PATH_FROM_ID.  If 'withref' is set, an iteration reference is
 * taken under the list lock and the mount is additionally vfs_busy'd
 * (LK_NOWAIT) before return; the iteration reference itself is dropped
 * again before returning.  Returns NULL if no matching, busy-able
 * mount is found.
 */
mount_t
mount_lookupby_volfsid(int volfs_id, int withref)
{
	mount_t cur_mount = (mount_t)0;
	mount_t mp;

	mount_list_lock();
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
		    (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
		    (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
			cur_mount = mp;
			if (withref) {
				/* Failed iterref means the mount is going away. */
				if (mount_iterref(cur_mount, 1)) {
					cur_mount = (mount_t)0;
					mount_list_unlock();
					goto out;
				}
			}
			break;
		}
	}
	mount_list_unlock();
	if (withref && (cur_mount != (mount_t)0)) {
		mp = cur_mount;
		/* Trade the iteration reference for a vfs_busy hold. */
		if (vfs_busy(mp, LK_NOWAIT) != 0) {
			cur_mount = (mount_t)0;
		}
		mount_iterdrop(mp);
	}
out:
	return cur_mount;
}
7514 
/*
 * Look up a mount by full fsid (both val[0] and val[1]).  If 'locked'
 * is set the caller already holds the mount list lock; otherwise it is
 * taken and released here.  If 'withref' is set, an iteration
 * reference is taken on the match under the list lock; a failed
 * iterref (mount being torn down) makes the lookup return NULL.
 */
mount_t
mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
{
	mount_t retmp = (mount_t)0;
	mount_t mp;

	if (!locked) {
		mount_list_lock();
	}
	TAILQ_FOREACH(mp, &mountlist, mnt_list)
	if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
	    mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
		retmp = mp;
		if (withref) {
			if (mount_iterref(retmp, 1)) {
				retmp = (mount_t)0;
			}
		}
		goto out;
	}
out:
	if (!locked) {
		mount_list_unlock();
	}
	return retmp;
}
7541 
/*
 * Resolve 'path' to a vnode, returning it in *vpp with an iocount held.
 * A relative path is resolved from 'start_dvp' when one is supplied
 * (via the USEDVP nameidata flag); otherwise from the context's cwd.
 * Lookup behavior is shaped by VNODE_LOOKUP_NOFOLLOW,
 * VNODE_LOOKUP_NOCROSSMOUNT and VNODE_LOOKUP_CROSSMOUNTNOWAIT.
 * Returns 0 on success or an errno from namei(); EINVAL for a NULL
 * context.  The caller must vnode_put() the result.
 */
errno_t
vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx,
    vnode_t start_dvp)
{
	struct nameidata *ndp;
	int error = 0;
	u_int32_t ndflags = 0;

	if (ctx == NULL) {
		return EINVAL;
	}

	/* nameidata is large; keep it off the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL);

	if (flags & VNODE_LOOKUP_NOFOLLOW) {
		ndflags = NOFOLLOW;
	} else {
		ndflags = FOLLOW;
	}

	if (flags & VNODE_LOOKUP_NOCROSSMOUNT) {
		ndflags |= NOCROSSMOUNT;
	}

	if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
		ndflags |= CN_NBMOUNTLOOK;
	}

	/* XXX AUDITVNPATH1 needed ? */
	NDINIT(ndp, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
	    CAST_USER_ADDR_T(path), ctx);

	/* Only honor start_dvp for relative paths. */
	if (start_dvp && (path[0] != '/')) {
		ndp->ni_dvp = start_dvp;
		ndp->ni_cnd.cn_flags |= USEDVP;
	}

	if ((error = namei(ndp))) {
		goto out_free;
	}

	ndp->ni_cnd.cn_flags &= ~USEDVP;

	*vpp = ndp->ni_vp;
	nameidone(ndp);

out_free:
	kfree_type(struct nameidata, ndp);
	return error;
}
7592 
7593 errno_t
vnode_lookup(const char * path,int flags,vnode_t * vpp,vfs_context_t ctx)7594 vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
7595 {
7596 	return vnode_lookupat(path, flags, vpp, ctx, NULLVP);
7597 }
7598 
/*
 * Open the file at 'path' with open mode 'fmode' and creation mode
 * 'cmode', returning the vnode in *vpp on success (NULL on failure).
 * 'flags' takes the same VNODE_LOOKUP_* bits as vnode_lookupat();
 * O_NOFOLLOW in fmode also implies NOFOLLOW.  A NULL context falls
 * back to the current thread's context.  Returns the vn_open() errno.
 */
errno_t
vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
{
	struct nameidata *ndp = NULL;
	int error;
	u_int32_t ndflags = 0;
	int lflags = flags;

	if (ctx == NULL) {              /* XXX technically an error */
		ctx = vfs_context_current();
	}

	/* nameidata is large; keep it off the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL);

	if (fmode & O_NOFOLLOW) {
		lflags |= VNODE_LOOKUP_NOFOLLOW;
	}

	if (lflags & VNODE_LOOKUP_NOFOLLOW) {
		ndflags = NOFOLLOW;
	} else {
		ndflags = FOLLOW;
	}

	if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) {
		ndflags |= NOCROSSMOUNT;
	}

	if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
		ndflags |= CN_NBMOUNTLOOK;
	}

	/* XXX AUDITVNPATH1 needed ? */
	NDINIT(ndp, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE,
	    CAST_USER_ADDR_T(path), ctx);

	if ((error = vn_open(ndp, fmode, cmode))) {
		*vpp = NULL;
	} else {
		*vpp = ndp->ni_vp;
	}

	kfree_type(struct nameidata, ndp);
	return error;
}
7644 
7645 errno_t
vnode_close(vnode_t vp,int flags,vfs_context_t ctx)7646 vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
7647 {
7648 	int error;
7649 
7650 	if (ctx == NULL) {
7651 		ctx = vfs_context_current();
7652 	}
7653 
7654 	error = vn_close(vp, flags, ctx);
7655 	vnode_put(vp);
7656 	return error;
7657 }
7658 
7659 errno_t
vnode_mtime(vnode_t vp,struct timespec * mtime,vfs_context_t ctx)7660 vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx)
7661 {
7662 	struct vnode_attr       va;
7663 	int                     error;
7664 
7665 	VATTR_INIT(&va);
7666 	VATTR_WANTED(&va, va_modify_time);
7667 	error = vnode_getattr(vp, &va, ctx);
7668 	if (!error) {
7669 		*mtime = va.va_modify_time;
7670 	}
7671 	return error;
7672 }
7673 
7674 errno_t
vnode_flags(vnode_t vp,uint32_t * flags,vfs_context_t ctx)7675 vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx)
7676 {
7677 	struct vnode_attr       va;
7678 	int                     error;
7679 
7680 	VATTR_INIT(&va);
7681 	VATTR_WANTED(&va, va_flags);
7682 	error = vnode_getattr(vp, &va, ctx);
7683 	if (!error) {
7684 		*flags = va.va_flags;
7685 	}
7686 	return error;
7687 }
7688 
7689 /*
7690  * Returns:	0			Success
7691  *	vnode_getattr:???
7692  */
7693 errno_t
vnode_size(vnode_t vp,off_t * sizep,vfs_context_t ctx)7694 vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
7695 {
7696 	struct vnode_attr       va;
7697 	int                     error;
7698 
7699 	VATTR_INIT(&va);
7700 	VATTR_WANTED(&va, va_data_size);
7701 	error = vnode_getattr(vp, &va, ctx);
7702 	if (!error) {
7703 		*sizep = va.va_data_size;
7704 	}
7705 	return error;
7706 }
7707 
7708 errno_t
vnode_setsize(vnode_t vp,off_t size,int ioflag,vfs_context_t ctx)7709 vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
7710 {
7711 	struct vnode_attr       va;
7712 
7713 	VATTR_INIT(&va);
7714 	VATTR_SET(&va, va_data_size, size);
7715 	va.va_vaflags = ioflag & 0xffff;
7716 	return vnode_setattr(vp, &va, ctx);
7717 }
7718 
7719 int
vnode_setdirty(vnode_t vp)7720 vnode_setdirty(vnode_t vp)
7721 {
7722 	vnode_lock_spin(vp);
7723 	vp->v_flag |= VISDIRTY;
7724 	vnode_unlock(vp);
7725 	return 0;
7726 }
7727 
7728 int
vnode_cleardirty(vnode_t vp)7729 vnode_cleardirty(vnode_t vp)
7730 {
7731 	vnode_lock_spin(vp);
7732 	vp->v_flag &= ~VISDIRTY;
7733 	vnode_unlock(vp);
7734 	return 0;
7735 }
7736 
7737 int
vnode_isdirty(vnode_t vp)7738 vnode_isdirty(vnode_t vp)
7739 {
7740 	int dirty;
7741 
7742 	vnode_lock_spin(vp);
7743 	dirty = (vp->v_flag & VISDIRTY) ? 1 : 0;
7744 	vnode_unlock(vp);
7745 
7746 	return dirty;
7747 }
7748 
7749 static int
vn_create_reg(vnode_t dvp,vnode_t * vpp,struct nameidata * ndp,struct vnode_attr * vap,uint32_t flags,int fmode,uint32_t * statusp,vfs_context_t ctx)7750 vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
7751 {
7752 	/* Only use compound VNOP for compound operation */
7753 	if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
7754 		*vpp = NULLVP;
7755 		return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
7756 	} else {
7757 		return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);
7758 	}
7759 }
7760 
7761 /*
7762  * Create a filesystem object of arbitrary type with arbitrary attributes in
 * the specified directory with the specified name.
7764  *
7765  * Parameters:	dvp			Pointer to the vnode of the directory
7766  *					in which to create the object.
7767  *		vpp			Pointer to the area into which to
7768  *					return the vnode of the created object.
7769  *		cnp			Component name pointer from the namei
7770  *					data structure, containing the name to
7771  *					use for the create object.
7772  *		vap			Pointer to the vnode_attr structure
7773  *					describing the object to be created,
7774  *					including the type of object.
7775  *		flags			VN_* flags controlling ACL inheritance
7776  *					and whether or not authorization is to
7777  *					be required for the operation.
7778  *
7779  * Returns:	0			Success
7780  *		!0			errno value
7781  *
7782  * Implicit:	*vpp			Contains the vnode of the object that
7783  *					was created, if successful.
7784  *		*cnp			May be modified by the underlying VFS.
 *		*vap			May be modified by the underlying VFS.
 *					In particular, the contents may be
 *					modified by either ACL inheritance or
 *					FS-specific defaulting, and so may
 *					be modified, even if the operation is
 *					unsuccessful.
 *
7792  * Notes:	The kauth_filesec_t in 'vap', if any, is in host byte order.
7793  *
7794  *		Modification of '*cnp' and '*vap' by the underlying VFS is
7795  *		strongly discouraged.
7796  *
7797  * XXX:		This function is a 'vn_*' function; it belongs in vfs_vnops.c
7798  *
 * XXX:		We should enumerate the possible errno values here, and where
7800  *		in the code they originated.
7801  */
errno_t
vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
{
	errno_t error, old_error;
	vnode_t vp = (vnode_t)0;
	boolean_t batched;      /* compound (create+open) VNOP in use */
	struct componentname *cnp;
	uint32_t defaulted;     /* which attrs vn_attribute_prepare defaulted */

	cnp = &ndp->ni_cnd;
	error = 0;
	batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;

	KAUTH_DEBUG("%p    CREATE - '%s'", dvp, cnp->cn_nameptr);

	if (flags & VN_CREATE_NOINHERIT) {
		vap->va_vaflags |= VA_NOINHERIT;
	}
	if (flags & VN_CREATE_NOAUTH) {
		vap->va_vaflags |= VA_NOAUTH;
	}
	/*
	 * Handle ACL inheritance, initialize vap.
	 */
	error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
	if (error) {
		return error;
	}

	/* Open-style arguments only make sense when creating a regular file. */
	if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) {
		panic("Open parameters, but not a regular file.");
	}
	if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
		panic("Mode for open, but not trying to open...");
	}


	/*
	 * Create the requested node.
	 */
	switch (vap->va_type) {
	case VREG:
		error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx);
		break;
	case VDIR:
		error = vn_mkdir(dvp, vpp, ndp, vap, ctx);
		break;
	case VSOCK:
	case VFIFO:
	case VBLK:
	case VCHR:
		error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
		break;
	default:
		panic("vnode_create: unknown vtype %d", vap->va_type);
	}
	if (error != 0) {
		KAUTH_DEBUG("%p    CREATE - error %d returned by filesystem", dvp, error);
		goto out;
	}

	vp = *vpp;
	/*
	 * Remember the (successful) create result; if a later step fails we
	 * use old_error to tell whether a compound open must be closed.
	 */
	old_error = error;

	/*
	 * If some of the requested attributes weren't handled by the VNOP,
	 * use our fallback code.
	 */
	if ((error == 0) && !VATTR_ALL_SUPPORTED(vap) && *vpp) {
		KAUTH_DEBUG("     CREATE - doing fallback with ACL %p", vap->va_acl);
		error = vnode_setattr_fallback(*vpp, vap, ctx);
	}

#if CONFIG_MACF
	if ((error == 0) && !(flags & VN_CREATE_NOLABEL)) {
		error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if ((error != 0) && (vp != (vnode_t)0)) {
		/* If we've done a compound open, close */
		if (batched && (old_error == 0) && (vap->va_type == VREG)) {
			VNOP_CLOSE(vp, fmode, ctx);
		}

		/* Need to provide notifications if a create succeeded */
		if (!batched) {
			*vpp = (vnode_t) 0;
			vnode_put(vp);
			vp = NULLVP;
		}
	}

	/*
	 * For creation VNOPs, this is the equivalent of
	 * lookup_handle_found_vnode.
	 */
	if (kdebug_enable && *vpp) {
		kdebug_lookup(*vpp, cnp);
	}

out:
	vn_attribute_cleanup(vap, defaulted);

	return error;
}
7908 
/* kauth scope handle for KAUTH_SCOPE_VNODE, set by vnode_authorize_init(). */
static kauth_scope_t    vnode_scope;
static int      vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
    uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
    vnode_t vp, vnode_t dvp, int *errorp);

/*
 * Per-authorization scratch state threaded through the vnode
 * authorization helpers: the target vnode/attributes, its parent
 * directory's vnode/attributes, the calling context, and cached
 * ownership/group-membership results (see _VAC_* bits in flags,
 * validity tracked in flags_valid).
 */
typedef struct _vnode_authorize_context {
	vnode_t         vp;
	struct vnode_attr *vap;
	vnode_t         dvp;
	struct vnode_attr *dvap;
	vfs_context_t   ctx;
	int             flags;
	int             flags_valid;
#define _VAC_IS_OWNER           (1<<0)
#define _VAC_IN_GROUP           (1<<1)
#define _VAC_IS_DIR_OWNER       (1<<2)
#define _VAC_IN_DIR_GROUP       (1<<3)
#define _VAC_NO_VNODE_POINTERS  (1<<4)
} *vauth_ctx;
7929 
/*
 * Register the vnode authorization scope with kauth, installing
 * vnode_authorize_callback as its listener.  Called once at startup.
 */
void
vnode_authorize_init(void)
{
	vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
}
7935 
7936 #define VATTR_PREPARE_DEFAULTED_UID             0x1
7937 #define VATTR_PREPARE_DEFAULTED_GID             0x2
7938 #define VATTR_PREPARE_DEFAULTED_MODE            0x4
7939 
/*
 * Prepare a vnode_attr structure for object creation in directory
 * 'dvp': apply ACL inheritance (when the mount supports extended
 * security and VA_NOINHERIT is not set) and let
 * vnode_authattr_new_internal() default/authorize uid, gid and mode.
 * *defaulted_fieldsp receives VATTR_PREPARE_DEFAULTED_* bits recording
 * which fields were defaulted, for vn_attribute_cleanup() to undo.
 *
 * On ACL inheritance the caller's original ACL is parked in
 * va_base_acl and the inherited product installed in va_acl;
 * vn_attribute_cleanup() restores and frees as appropriate.
 * Returns 0 on success or an errno; on authattr failure the vap has
 * already been cleaned up here.
 */
int
vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
{
	kauth_acl_t nacl = NULL, oacl = NULL;
	int error;

	/*
	 * Handle ACL inheritance.
	 */
	if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
		/* save the original filesec */
		if (VATTR_IS_ACTIVE(vap, va_acl)) {
			oacl = vap->va_acl;
		}

		vap->va_acl = NULL;
		if ((error = kauth_acl_inherit(dvp,
		    oacl,
		    &nacl,
		    vap->va_type == VDIR,
		    ctx)) != 0) {
			KAUTH_DEBUG("%p    CREATE - error %d processing inheritance", dvp, error);
			return error;
		}

		/*
		 * If the generated ACL is NULL, then we can save ourselves some effort
		 * by clearing the active bit.
		 */
		if (nacl == NULL) {
			VATTR_CLEAR_ACTIVE(vap, va_acl);
		} else {
			vap->va_base_acl = oacl;
			VATTR_SET(vap, va_acl, nacl);
		}
	}

	error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
	if (error) {
		vn_attribute_cleanup(vap, *defaulted_fieldsp);
	}

	return error;
}
7984 
/*
 * Undo the work of vn_attribute_prepare(): restore the caller's
 * original ACL (parked in va_base_acl), free the inherited ACL copy
 * where this code owns it, and clear the active bits for any fields
 * that were defaulted (per the VATTR_PREPARE_DEFAULTED_* bits in
 * 'defaulted_fields') so the caller's vap is back to its pre-prepare
 * state.
 */
void
vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields)
{
	/*
	 * If the caller supplied a filesec in vap, it has been replaced
	 * now by the post-inheritance copy.  We need to put the original back
	 * and free the inherited product.
	 */
	kauth_acl_t nacl, oacl;

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		nacl = vap->va_acl;
		oacl = vap->va_base_acl;

		if (oacl) {
			VATTR_SET(vap, va_acl, oacl);
			vap->va_base_acl = NULL;
		} else {
			VATTR_CLEAR_ACTIVE(vap, va_acl);
		}

		if (nacl != NULL) {
			/*
			 * Only free the ACL buffer if 'VA_FILESEC_ACL' is not set as it
			 * should be freed by the caller or it is a post-inheritance copy.
			 */
			if (!(vap->va_vaflags & VA_FILESEC_ACL) ||
			    (oacl != NULL && nacl != oacl)) {
				kauth_acl_free(nacl);
			}
		}
	}

	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) {
		VATTR_CLEAR_ACTIVE(vap, va_mode);
	}
	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) {
		VATTR_CLEAR_ACTIVE(vap, va_gid);
	}
	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) {
		VATTR_CLEAR_ACTIVE(vap, va_uid);
	}

	return;
}
8030 
/*
 * Authorize unlinking 'vp' from directory 'dvp'.  Directories may only
 * be unlinked on filesystems that advertise MNTK_DIR_HARDLINKS
 * (otherwise EPERM, per POSIX).  Runs the MAC unlink check (when
 * configured) and then a KAUTH_VNODE_DELETE authorization.
 * Returns 0 if the delete is permitted, or an errno.
 */
int
vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved)
{
#if !CONFIG_MACF
#pragma unused(cnp)
#endif
	int error = 0;

	/*
	 * Normally, unlinking of directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if ((vp->v_type == VDIR) &&
	    !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) {
		return EPERM; /* POSIX */
	}

	/* authorize the delete operation */
#if CONFIG_MACF
	if (!error) {
		error = mac_vnode_check_unlink(ctx, dvp, vp, cnp);
	}
#endif /* MAC */
	if (!error) {
		error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
	}

	return error;
}
8060 
/*
 * Authorize opening an existing vnode with open mode 'fmode'.
 * Performs type/mode sanity checks (no writes to directories, no
 * O_DIRECTORY on non-directories, no opening sockets, ELOOP for
 * O_NOFOLLOW on symlinks, trailing-slash and FEXEC restrictions),
 * runs the MAC open check (shadow namedstream files are exempt),
 * and finally authorizes the kauth action derived from fmode.
 * Returns 0 if the open is permitted, or an errno.
 */
int
vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved)
{
	/* Open of existing case */
	kauth_action_t action;
	int error = 0;
	if (cnp->cn_ndp == NULL) {
		panic("NULL ndp");
	}
	if (reserved != NULL) {
		panic("reserved not NULL.");
	}

#if CONFIG_MACF
	/* XXX may do duplicate work here, but ignore that for now (idempotent) */
	if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
		error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
		if (error) {
			return error;
		}
	}
#endif

	if (vnode_isdir(vp)) {
		if ((fmode & (FWRITE | O_TRUNC)) || /* disallow write operations on directories */
		    ((fmode & FSEARCH) && !(fmode & O_DIRECTORY))) {
			return EISDIR;
		}
	} else {
		if (fmode & O_DIRECTORY) {
			return ENOTDIR;
		}

		if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) {
			return EOPNOTSUPP;    /* Operation not supported on socket */
		}

		if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) {
			return ELOOP;         /* O_NOFOLLOW was specified and the target is a symbolic link */
		}

		/* A path ending in '/' must name a directory. */
		if (cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH) {
			return ENOTDIR;
		}

		if (!vnode_isreg(vp) && (fmode & FEXEC)) {
			return EACCES;
		}
	}

#if CONFIG_MACF
	/* If a file being opened is a shadow file containing
	 * namedstream data, ignore the macf checks because it
	 * is a kernel internal file and access should always
	 * be allowed.
	 */
	if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) {
		error = mac_vnode_check_open(ctx, vp, fmode);
		if (error) {
			return error;
		}
	}
#endif

	/* compute action to be authorized */
	action = 0;
	if (fmode & FREAD) {
		action |= KAUTH_VNODE_READ_DATA;
	}
	if (fmode & (FWRITE | O_TRUNC)) {
		/*
		 * If we are writing, appending, and not truncating,
		 * indicate that we are appending so that if the
		 * UF_APPEND or SF_APPEND bits are set, we do not deny
		 * the open.
		 */
		if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
			action |= KAUTH_VNODE_APPEND_DATA;
		} else {
			action |= KAUTH_VNODE_WRITE_DATA;
		}
	}
	if (fmode & (FSEARCH | FEXEC)) {
		if (vnode_isdir(vp)) {
			action |= KAUTH_VNODE_SEARCH;
		} else {
			action |= KAUTH_VNODE_EXECUTE;
		}
	}
	error = vnode_authorize(vp, NULL, action, ctx);
#if NAMEDSTREAMS
	if (error == EACCES) {
		/*
		 * Shadow files may exist on-disk with a different UID/GID
		 * than that of the current context.  Verify that this file
		 * is really a shadow file.  If it was created successfully
		 * then it should be authorized.
		 */
		if (vnode_isshadow(vp) && vnode_isnamedstream(vp)) {
			error = vnode_verifynamedstream(vp);
		}
	}
#endif

	return error;
}
8167 
/*
 * Authorize creating a new object named by 'cnp' in directory 'dvp'.
 * Validates the creation path when the lookup was left unfinished
 * (compound operations), runs the MAC create check (when configured),
 * and authorizes KAUTH_VNODE_ADD_FILE on the directory.
 * Returns 0 if creation is permitted, or an errno.
 */
int
vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
{
#if !CONFIG_MACF
#pragma unused(vap)
#endif
	/* Creation case */
	int error;

	if (cnp->cn_ndp == NULL) {
		panic("NULL cn_ndp");
	}
	if (reserved != NULL) {
		panic("reserved not NULL.");
	}

	/* Only validate path for creation if we didn't do a complete lookup */
	if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) {
		error = lookup_validate_creation_path(cnp->cn_ndp);
		if (error) {
			return error;
		}
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx, dvp, cnp, vap);
	if (error) {
		return error;
	}
#endif /* CONFIG_MACF */

	return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
}
8201 
8202 int
vn_authorize_rename(struct vnode * fdvp,struct vnode * fvp,struct componentname * fcnp,struct vnode * tdvp,struct vnode * tvp,struct componentname * tcnp,vfs_context_t ctx,void * reserved)8203 vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
8204     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
8205     vfs_context_t ctx, void *reserved)
8206 {
8207 	return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
8208 }
8209 
8210 int
vn_authorize_renamex(struct vnode * fdvp,struct vnode * fvp,struct componentname * fcnp,struct vnode * tdvp,struct vnode * tvp,struct componentname * tcnp,vfs_context_t ctx,vfs_rename_flags_t flags,void * reserved)8211 vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
8212     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
8213     vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
8214 {
8215 	return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
8216 }
8217 
/*
 * Authorize renaming (or, with VFS_RENAME_SWAP, exchanging) fvp in fdvp to
 * the name tcnp in tdvp, replacing tvp if it exists.  The optional
 * from_path/to_path strings are used only to post KAUTH_FILEOP_WILL_RENAME
 * notifications; the authorization decision itself does not depend on them.
 *
 * Returns 0 if the operation is authorized, otherwise an errno value.
 */
int
vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path,
    struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path,
    vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
{
	int error = 0;
	int moving = 0;
	bool swap = flags & VFS_RENAME_SWAP;

	if (reserved != NULL) {
		panic("Passed something other than NULL as reserved field!");
	}

	/*
	 * Avoid renaming "." and "..".
	 *
	 * XXX No need to check for this in the FS.  We should always have the leaves
	 * in VFS in this case.
	 */
	if (fvp->v_type == VDIR &&
	    ((fdvp == fvp) ||
	    (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
	    ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT))) {
		error = EINVAL;
		goto out;
	}

	/*
	 * If the target does not exist and the filesystem supports compound
	 * rename, the target lookup may have been left unfinished; validate
	 * the creation path before authorizing.
	 */
	if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
		error = lookup_validate_creation_path(tcnp->cn_ndp);
		if (error) {
			goto out;
		}
	}

	/***** <MACF> *****/
#if CONFIG_MACF
	error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
	if (error) {
		goto out;
	}
	if (swap) {
		/* a swap is two renames; check the reverse direction as well */
		error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
		if (error) {
			goto out;
		}
	}
#endif
	/***** </MACF> *****/

	/***** <MiscChecks> *****/
	if (tvp != NULL) {
		if (!swap) {
			/* a directory may only replace a directory, and vice versa */
			if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
	} else if (swap) {
		/*
		 * Caller should have already checked this and returned
		 * ENOENT.  If we send back ENOENT here, caller will retry
		 * which isn't what we want so we send back EINVAL here
		 * instead.
		 */
		error = EINVAL;
		goto out;
	}

	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * The following edge case is caught here:
	 * (to cannot be a descendent of from)
	 *
	 *       o fdvp
	 *      /
	 *     /
	 *    o fvp
	 *     \
	 *      \
	 *       o tdvp
	 *      /
	 *     /
	 *    o tvp
	 */
	if (tdvp->v_parent == fvp) {
		error = EINVAL;
		goto out;
	}

	/* for a swap, the same cycle check applies in the other direction */
	if (swap && fdvp->v_parent == tvp) {
		error = EINVAL;
		goto out;
	}
	/***** </MiscChecks> *****/

	/***** <Kauth> *****/

	/*
	 * As part of the Kauth step, we call out to allow 3rd-party
	 * fileop notification of "about to rename".  This is needed
	 * in the event that 3rd-parties need to know that the DELETE
	 * authorization is actually part of a rename.  It's important
	 * that we guarantee that the DELETE call-out will always be
	 * made if the WILL_RENAME call-out is made.  Another fileop
	 * call-out will be performed once the operation is completed.
	 * We can ignore the result of kauth_authorize_fileop().
	 *
	 * N.B. We are passing the vnode and *both* paths to each
	 * call; kauth_authorize_fileop() extracts the "from" path
	 * when posting a KAUTH_FILEOP_WILL_RENAME notification.
	 * As such, we only post these notifications if all of the
	 * information we need is provided.
	 */

	if (swap) {
		kauth_action_t f = 0, t = 0;

		/*
		 * Directories changing parents need ...ADD_SUBDIR...  to
		 * permit changing ".."
		 */
		if (fdvp != tdvp) {
			if (vnode_isdir(fvp)) {
				f = KAUTH_VNODE_ADD_SUBDIRECTORY;
			}
			if (vnode_isdir(tvp)) {
				t = KAUTH_VNODE_ADD_SUBDIRECTORY;
			}
		}
		if (to_path != NULL) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_WILL_RENAME,
			    (uintptr_t)fvp,
			    (uintptr_t)to_path);
		}
		/* removal of fvp's old name (and ".." rewrite if a directory) */
		error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
		if (error) {
			goto out;
		}
		if (from_path != NULL) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_WILL_RENAME,
			    (uintptr_t)tvp,
			    (uintptr_t)from_path);
		}
		/* removal of tvp's old name (and ".." rewrite if a directory) */
		error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
		if (error) {
			goto out;
		}
		/*
		 * After the swap, fvp lands in tdvp and tvp lands in fdvp,
		 * so fdvp is authorized for the addition of tvp ('t') and
		 * tdvp for the addition of fvp ('f').
		 */
		f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
		t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
		if (fdvp == tdvp) {
			error = vnode_authorize(fdvp, NULL, f | t, ctx);
		} else {
			error = vnode_authorize(fdvp, NULL, t, ctx);
			if (error) {
				goto out;
			}
			error = vnode_authorize(tdvp, NULL, f, ctx);
		}
		if (error) {
			goto out;
		}
	} else {
		error = 0;
		/*
		 * "moving" means the new name lives in a different directory
		 * than the old one (renaming onto a directory moves into it).
		 */
		if ((tvp != NULL) && vnode_isdir(tvp)) {
			if (tvp != fdvp) {
				moving = 1;
			}
		} else if (tdvp != fdvp) {
			moving = 1;
		}

		/*
		 * must have delete rights to remove the old name even in
		 * the simple case of fdvp == tdvp.
		 *
		 * If fvp is a directory, and we are changing its parent,
		 * then we also need rights to rewrite its ".." entry as well.
		 */
		if (to_path != NULL) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_WILL_RENAME,
			    (uintptr_t)fvp,
			    (uintptr_t)to_path);
		}
		if (vnode_isdir(fvp)) {
			if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
				goto out;
			}
		} else {
			if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
				goto out;
			}
		}
		if (moving) {
			/* moving into tdvp or tvp, must have rights to add */
			if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
			    NULL,
			    vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
			    ctx)) != 0) {
				goto out;
			}
		} else {
			/* node staying in same directory, must be allowed to add new name */
			if ((error = vnode_authorize(fdvp, NULL,
			    vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
				goto out;
			}
		}
		/* overwriting tvp */
		if ((tvp != NULL) && !vnode_isdir(tvp) &&
		    ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
			goto out;
		}
	}

	/***** </Kauth> *****/

	/* XXX more checks? */
out:
	return error;
}
8448 
8449 int
vn_authorize_mkdir(vnode_t dvp,struct componentname * cnp,struct vnode_attr * vap,vfs_context_t ctx,void * reserved)8450 vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
8451 {
8452 #if !CONFIG_MACF
8453 #pragma unused(vap)
8454 #endif
8455 	int error;
8456 
8457 	if (reserved != NULL) {
8458 		panic("reserved not NULL in vn_authorize_mkdir()");
8459 	}
8460 
8461 	/* XXX A hack for now, to make shadow files work */
8462 	if (cnp->cn_ndp == NULL) {
8463 		return 0;
8464 	}
8465 
8466 	if (vnode_compound_mkdir_available(dvp)) {
8467 		error = lookup_validate_creation_path(cnp->cn_ndp);
8468 		if (error) {
8469 			goto out;
8470 		}
8471 	}
8472 
8473 #if CONFIG_MACF
8474 	error = mac_vnode_check_create(ctx,
8475 	    dvp, cnp, vap);
8476 	if (error) {
8477 		goto out;
8478 	}
8479 #endif
8480 
8481 	/* authorize addition of a directory to the parent */
8482 	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
8483 		goto out;
8484 	}
8485 
8486 out:
8487 	return error;
8488 }
8489 
8490 int
vn_authorize_rmdir(vnode_t dvp,vnode_t vp,struct componentname * cnp,vfs_context_t ctx,void * reserved)8491 vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved)
8492 {
8493 #if CONFIG_MACF
8494 	int error;
8495 #else
8496 #pragma unused(cnp)
8497 #endif
8498 	if (reserved != NULL) {
8499 		panic("Non-NULL reserved argument to vn_authorize_rmdir()");
8500 	}
8501 
8502 	if (vp->v_type != VDIR) {
8503 		/*
8504 		 * rmdir only deals with directories
8505 		 */
8506 		return ENOTDIR;
8507 	}
8508 
8509 	if (dvp == vp) {
8510 		/*
8511 		 * No rmdir "." please.
8512 		 */
8513 		return EINVAL;
8514 	}
8515 
8516 #if CONFIG_MACF
8517 	error = mac_vnode_check_unlink(ctx, dvp,
8518 	    vp, cnp);
8519 	if (error) {
8520 		return error;
8521 	}
8522 #endif
8523 
8524 	return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
8525 }
8526 
8527 /*
8528  * Authorizer for directory cloning. This does not use vnodes but instead
8529  * uses prefilled vnode attributes from the filesystem.
8530  *
8531  * The same function is called to set up the attributes required, perform the
8532  * authorization and cleanup (if required)
8533  */
/*
 * See the block comment above: one entry point serves three phases,
 * selected by 'vattr_op' — OP_VATTR_SETUP (declare wanted attributes),
 * OP_VATTR_CLEANUP (currently a no-op), and the authorization phase,
 * which evaluates 'vap' against 'action' and then fills in the
 * ownership and restricted-flag attributes for the clone.
 */
int
vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action,
    struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp,
    dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx,
    __unused void *reserved)
{
	int error;
	int is_suser = vfs_context_issuser(ctx);

	/* Setup phase: mark the attributes the filesystem must supply. */
	if (vattr_op == OP_VATTR_SETUP) {
		VATTR_INIT(vap);

		/*
		 * When ACL inheritance is implemented, both vap->va_acl and
		 * dvap->va_acl will be required (even as superuser).
		 */
		VATTR_WANTED(vap, va_type);
		VATTR_WANTED(vap, va_mode);
		VATTR_WANTED(vap, va_flags);
		VATTR_WANTED(vap, va_uid);
		VATTR_WANTED(vap, va_gid);
		if (dvap) {
			VATTR_INIT(dvap);
			VATTR_WANTED(dvap, va_flags);
		}

		if (!is_suser) {
			/*
			 * If not superuser, we have to evaluate ACLs and
			 * need the target directory gid to set the initial
			 * gid of the new object.
			 */
			VATTR_WANTED(vap, va_acl);
			if (dvap) {
				VATTR_WANTED(dvap, va_gid);
			}
		} else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
			VATTR_WANTED(dvap, va_gid);
		}
		return 0;
	} else if (vattr_op == OP_VATTR_CLEANUP) {
		return 0; /* Nothing to do for now */
	}

	/* Authorization phase follows.  dvap isn't used for authorization. */
	error = vnode_attr_authorize(vap, NULL, mp, action, ctx);

	if (error) {
		return error;
	}

	/*
	 * vn_attribute_prepare should be able to accept attributes as well as
	 * vnodes but for now we do this inline.
	 */
	if (!is_suser || (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
		/*
		 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit
		 * owner is set, that owner takes ownership of all new files.
		 */
		if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
		    (mp->mnt_fsowner != KAUTH_UID_NONE)) {
			VATTR_SET(vap, va_uid, mp->mnt_fsowner);
		} else {
			/* default owner is current user */
			VATTR_SET(vap, va_uid,
			    kauth_cred_getuid(vfs_context_ucred(ctx)));
		}

		if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
		    (mp->mnt_fsgroup != KAUTH_GID_NONE)) {
			VATTR_SET(vap, va_gid, mp->mnt_fsgroup);
		} else {
			/*
			 * default group comes from parent object,
			 * fallback to current user
			 *
			 * NOTE(review): dvap is dereferenced here (and below)
			 * without a NULL check even though the setup phase
			 * tolerates a NULL dvap; presumably callers always
			 * supply dvap in this phase — confirm.
			 */
			if (VATTR_IS_SUPPORTED(dvap, va_gid)) {
				VATTR_SET(vap, va_gid, dvap->va_gid);
			} else {
				VATTR_SET(vap, va_gid,
				    kauth_cred_getgid(vfs_context_ucred(ctx)));
			}
		}
	}

	/* Inherit SF_RESTRICTED bit from destination directory only */
	if (VATTR_IS_ACTIVE(vap, va_flags)) {
		VATTR_SET(vap, va_flags,
		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)))); /* Turn off from source */
		if (VATTR_IS_ACTIVE(dvap, va_flags)) {
			VATTR_SET(vap, va_flags,
			    vap->va_flags | (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
		}
	} else if (VATTR_IS_ACTIVE(dvap, va_flags)) {
		VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
	}

	return 0;
}
8634 
8635 
8636 /*
8637  * Authorize an operation on a vnode.
8638  *
8639  * This is KPI, but here because it needs vnode_scope.
8640  *
8641  * Returns:	0			Success
8642  *	kauth_authorize_action:EPERM	...
8643  *	xlate => EACCES			Permission denied
8644  *	kauth_authorize_action:0	Success
8645  *	kauth_authorize_action:		Depends on callback return; this is
8646  *					usually only vnode_authorize_callback(),
8647  *					but may include other listerners, if any
8648  *					exist.
8649  *		EROFS
8650  *		EACCES
8651  *		EPERM
8652  *		???
8653  */
8654 int
vnode_authorize(vnode_t vp,vnode_t dvp,kauth_action_t action,vfs_context_t ctx)8655 vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
8656 {
8657 	int     error, result;
8658 
8659 	/*
8660 	 * We can't authorize against a dead vnode; allow all operations through so that
8661 	 * the correct error can be returned.
8662 	 */
8663 	if (vp->v_type == VBAD) {
8664 		return 0;
8665 	}
8666 
8667 	error = 0;
8668 	result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
8669 	    (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
8670 	if (result == EPERM) {          /* traditional behaviour */
8671 		result = EACCES;
8672 	}
8673 	/* did the lower layers give a better error return? */
8674 	if ((result != 0) && (error != 0)) {
8675 		return error;
8676 	}
8677 	return result;
8678 }
8679 
8680 /*
8681  * Test for vnode immutability.
8682  *
8683  * The 'append' flag is set when the authorization request is constrained
8684  * to operations which only request the right to append to a file.
8685  *
8686  * The 'ignore' flag is set when an operation modifying the immutability flags
8687  * is being authorized.  We check the system securelevel to determine which
8688  * immutability flags we can ignore.
8689  */
8690 static int
vnode_immutable(struct vnode_attr * vap,int append,int ignore)8691 vnode_immutable(struct vnode_attr *vap, int append, int ignore)
8692 {
8693 	int     mask;
8694 
8695 	/* start with all bits precluding the operation */
8696 	mask = IMMUTABLE | APPEND;
8697 
8698 	/* if appending only, remove the append-only bits */
8699 	if (append) {
8700 		mask &= ~APPEND;
8701 	}
8702 
8703 	/* ignore only set when authorizing flags changes */
8704 	if (ignore) {
8705 		if (securelevel <= 0) {
8706 			/* in insecure state, flags do not inhibit changes */
8707 			mask = 0;
8708 		} else {
8709 			/* in secure state, user flags don't inhibit */
8710 			mask &= ~(UF_IMMUTABLE | UF_APPEND);
8711 		}
8712 	}
8713 	KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
8714 	if ((vap->va_flags & mask) != 0) {
8715 		return EPERM;
8716 	}
8717 	return 0;
8718 }
8719 
8720 static int
vauth_node_owner(struct vnode_attr * vap,kauth_cred_t cred)8721 vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
8722 {
8723 	int result;
8724 
8725 	/* default assumption is not-owner */
8726 	result = 0;
8727 
8728 	/*
8729 	 * If the filesystem has given us a UID, we treat this as authoritative.
8730 	 */
8731 	if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
8732 		result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
8733 	}
8734 	/* we could test the owner UUID here if we had a policy for it */
8735 
8736 	return result;
8737 }
8738 
8739 /*
8740  * vauth_node_group
8741  *
8742  * Description:	Ask if a cred is a member of the group owning the vnode object
8743  *
8744  * Parameters:		vap		vnode attribute
8745  *				vap->va_gid	group owner of vnode object
8746  *			cred		credential to check
8747  *			ismember	pointer to where to put the answer
8748  *			idontknow	Return this if we can't get an answer
8749  *
8750  * Returns:		0		Success
8751  *			idontknow	Can't get information
8752  *	kauth_cred_ismember_gid:?	Error from kauth subsystem
8753  *	kauth_cred_ismember_gid:?	Error from kauth subsystem
8754  */
8755 static int
vauth_node_group(struct vnode_attr * vap,kauth_cred_t cred,int * ismember,int idontknow)8756 vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow)
8757 {
8758 	int     error;
8759 	int     result;
8760 
8761 	error = 0;
8762 	result = 0;
8763 
8764 	/*
8765 	 * The caller is expected to have asked the filesystem for a group
8766 	 * at some point prior to calling this function.  The answer may
8767 	 * have been that there is no group ownership supported for the
8768 	 * vnode object, in which case we return
8769 	 */
8770 	if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
8771 		error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
8772 		/*
8773 		 * Credentials which are opted into external group membership
8774 		 * resolution which are not known to the external resolver
8775 		 * will result in an ENOENT error.  We translate this into
8776 		 * the appropriate 'idontknow' response for our caller.
8777 		 *
8778 		 * XXX We do not make a distinction here between an ENOENT
8779 		 * XXX arising from a response from the external resolver,
8780 		 * XXX and an ENOENT which is internally generated.  This is
8781 		 * XXX a deficiency of the published kauth_cred_ismember_gid()
8782 		 * XXX KPI which can not be overcome without new KPI.  For
8783 		 * XXX all currently known cases, however, this wil result
8784 		 * XXX in correct behaviour.
8785 		 */
8786 		if (error == ENOENT) {
8787 			error = idontknow;
8788 		}
8789 	}
8790 	/*
8791 	 * XXX We could test the group UUID here if we had a policy for it,
8792 	 * XXX but this is problematic from the perspective of synchronizing
8793 	 * XXX group UUID and POSIX GID ownership of a file and keeping the
8794 	 * XXX values coherent over time.  The problem is that the local
8795 	 * XXX system will vend transient group UUIDs for unknown POSIX GID
8796 	 * XXX values, and these are not persistent, whereas storage of values
8797 	 * XXX is persistent.  One potential solution to this is a local
8798 	 * XXX (persistent) replica of remote directory entries and vended
8799 	 * XXX local ids in a local directory server (think in terms of a
8800 	 * XXX caching DNS server).
8801 	 */
8802 
8803 	if (!error) {
8804 		*ismember = result;
8805 	}
8806 	return error;
8807 }
8808 
8809 static int
vauth_file_owner(vauth_ctx vcp)8810 vauth_file_owner(vauth_ctx vcp)
8811 {
8812 	int result;
8813 
8814 	if (vcp->flags_valid & _VAC_IS_OWNER) {
8815 		result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
8816 	} else {
8817 		result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
8818 
8819 		/* cache our result */
8820 		vcp->flags_valid |= _VAC_IS_OWNER;
8821 		if (result) {
8822 			vcp->flags |= _VAC_IS_OWNER;
8823 		} else {
8824 			vcp->flags &= ~_VAC_IS_OWNER;
8825 		}
8826 	}
8827 	return result;
8828 }
8829 
8830 
8831 /*
8832  * vauth_file_ingroup
8833  *
8834  * Description:	Ask if a user is a member of the group owning the directory
8835  *
8836  * Parameters:		vcp		The vnode authorization context that
8837  *					contains the user and directory info
8838  *				vcp->flags_valid	Valid flags
8839  *				vcp->flags		Flags values
8840  *				vcp->vap		File vnode attributes
8841  *				vcp->ctx		VFS Context (for user)
8842  *			ismember	pointer to where to put the answer
8843  *			idontknow	Return this if we can't get an answer
8844  *
8845  * Returns:		0		Success
8846  *		vauth_node_group:?	Error from vauth_node_group()
8847  *
8848  * Implicit returns:	*ismember	0	The user is not a group member
8849  *					1	The user is a group member
8850  */
8851 static int
vauth_file_ingroup(vauth_ctx vcp,int * ismember,int idontknow)8852 vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
8853 {
8854 	int     error;
8855 
8856 	/* Check for a cached answer first, to avoid the check if possible */
8857 	if (vcp->flags_valid & _VAC_IN_GROUP) {
8858 		*ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
8859 		error = 0;
8860 	} else {
8861 		/* Otherwise, go look for it */
8862 		error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
8863 
8864 		if (!error) {
8865 			/* cache our result */
8866 			vcp->flags_valid |= _VAC_IN_GROUP;
8867 			if (*ismember) {
8868 				vcp->flags |= _VAC_IN_GROUP;
8869 			} else {
8870 				vcp->flags &= ~_VAC_IN_GROUP;
8871 			}
8872 		}
8873 	}
8874 	return error;
8875 }
8876 
8877 static int
vauth_dir_owner(vauth_ctx vcp)8878 vauth_dir_owner(vauth_ctx vcp)
8879 {
8880 	int result;
8881 
8882 	if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
8883 		result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
8884 	} else {
8885 		result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
8886 
8887 		/* cache our result */
8888 		vcp->flags_valid |= _VAC_IS_DIR_OWNER;
8889 		if (result) {
8890 			vcp->flags |= _VAC_IS_DIR_OWNER;
8891 		} else {
8892 			vcp->flags &= ~_VAC_IS_DIR_OWNER;
8893 		}
8894 	}
8895 	return result;
8896 }
8897 
8898 /*
8899  * vauth_dir_ingroup
8900  *
8901  * Description:	Ask if a user is a member of the group owning the directory
8902  *
8903  * Parameters:		vcp		The vnode authorization context that
8904  *					contains the user and directory info
8905  *				vcp->flags_valid	Valid flags
8906  *				vcp->flags		Flags values
8907  *				vcp->dvap		Dir vnode attributes
8908  *				vcp->ctx		VFS Context (for user)
8909  *			ismember	pointer to where to put the answer
8910  *			idontknow	Return this if we can't get an answer
8911  *
8912  * Returns:		0		Success
8913  *		vauth_node_group:?	Error from vauth_node_group()
8914  *
8915  * Implicit returns:	*ismember	0	The user is not a group member
8916  *					1	The user is a group member
8917  */
8918 static int
vauth_dir_ingroup(vauth_ctx vcp,int * ismember,int idontknow)8919 vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
8920 {
8921 	int     error;
8922 
8923 	/* Check for a cached answer first, to avoid the check if possible */
8924 	if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
8925 		*ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
8926 		error = 0;
8927 	} else {
8928 		/* Otherwise, go look for it */
8929 		error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
8930 
8931 		if (!error) {
8932 			/* cache our result */
8933 			vcp->flags_valid |= _VAC_IN_DIR_GROUP;
8934 			if (*ismember) {
8935 				vcp->flags |= _VAC_IN_DIR_GROUP;
8936 			} else {
8937 				vcp->flags &= ~_VAC_IN_DIR_GROUP;
8938 			}
8939 		}
8940 	}
8941 	return error;
8942 }
8943 
8944 /*
8945  * Test the posix permissions in (vap) to determine whether (credential)
8946  * may perform (action)
8947  */
static int
vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
{
	struct vnode_attr *vap;
	int needed, error, owner_ok, group_ok, world_ok, ismember;
#ifdef KAUTH_DEBUG_ENABLE
	const char *where = "uninitialized";
# define _SETWHERE(c)   where = c;
#else
# define _SETWHERE(c)
#endif

	/* checking file or directory? */
	if (on_dir) {
		vap = vcp->dvap;
	} else {
		vap = vcp->vap;
	}

	error = 0;

	/*
	 * We want to do as little work here as possible.  So first we check
	 * which sets of permissions grant us the access we need, and avoid checking
	 * whether specific permissions grant access when more generic ones would.
	 */

	/* owner permissions */
	needed = 0;
	if (action & VREAD) {
		needed |= S_IRUSR;
	}
	if (action & VWRITE) {
		needed |= S_IWUSR;
	}
	if (action & VEXEC) {
		needed |= S_IXUSR;
	}
	owner_ok = (needed & vap->va_mode) == needed;

	/*
	 * Processes with the appropriate entitlement can mark themselves as
	 * ignoring file/directory permissions if they own it.
	 */
	if (!owner_ok && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
		owner_ok = 1;
	}

	/* group permissions */
	needed = 0;
	if (action & VREAD) {
		needed |= S_IRGRP;
	}
	if (action & VWRITE) {
		needed |= S_IWGRP;
	}
	if (action & VEXEC) {
		needed |= S_IXGRP;
	}
	group_ok = (needed & vap->va_mode) == needed;

	/* world permissions */
	needed = 0;
	if (action & VREAD) {
		needed |= S_IROTH;
	}
	if (action & VWRITE) {
		needed |= S_IWOTH;
	}
	if (action & VEXEC) {
		needed |= S_IXOTH;
	}
	world_ok = (needed & vap->va_mode) == needed;

	/* If granted/denied by all three, we're done */
	if (owner_ok && group_ok && world_ok) {
		_SETWHERE("all");
		goto out;
	}

	if (!owner_ok && !group_ok && !world_ok) {
		_SETWHERE("all");
		error = EACCES;
		goto out;
	}

	/* Check ownership (relatively cheap) */
	if ((on_dir && vauth_dir_owner(vcp)) ||
	    (!on_dir && vauth_file_owner(vcp))) {
		/* owner: the owner bits alone decide the answer */
		_SETWHERE("user");
		if (!owner_ok) {
			error = EACCES;
		}
		goto out;
	}

	/* Not owner; if group and world both grant it we're done */
	if (group_ok && world_ok) {
		_SETWHERE("group/world");
		goto out;
	}
	if (!group_ok && !world_ok) {
		_SETWHERE("group/world");
		error = EACCES;
		goto out;
	}

	/* Check group membership (most expensive) */
	ismember = 0;   /* Default to allow, if the target has no group owner */

	/*
	 * In the case we can't get an answer about the user from the call to
	 * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on
	 * the side of caution, rather than simply granting access, or we will
	 * fail to correctly implement exclusion groups, so we set the third
	 * parameter on the basis of the state of 'group_ok'.
	 */
	if (on_dir) {
		error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	} else {
		error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	}
	if (error) {
		/* membership unknown: pick whichever answer denies access */
		if (!group_ok) {
			ismember = 1;
		}
		error = 0;
	}
	if (ismember) {
		/* group member: the group bits decide the answer */
		_SETWHERE("group");
		if (!group_ok) {
			error = EACCES;
		}
		goto out;
	}

	/* Not owner, not in group, use world result */
	_SETWHERE("world");
	if (!world_ok) {
		error = EACCES;
	}

	/* FALLTHROUGH */

out:
	KAUTH_DEBUG("%p    %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
	    vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
	    (action & VREAD)  ? "r" : "-",
	    (action & VWRITE) ? "w" : "-",
	    (action & VEXEC)  ? "x" : "-",
	    needed,
	    (vap->va_mode & S_IRUSR) ? "r" : "-",
	    (vap->va_mode & S_IWUSR) ? "w" : "-",
	    (vap->va_mode & S_IXUSR) ? "x" : "-",
	    (vap->va_mode & S_IRGRP) ? "r" : "-",
	    (vap->va_mode & S_IWGRP) ? "w" : "-",
	    (vap->va_mode & S_IXGRP) ? "x" : "-",
	    (vap->va_mode & S_IROTH) ? "r" : "-",
	    (vap->va_mode & S_IWOTH) ? "w" : "-",
	    (vap->va_mode & S_IXOTH) ? "x" : "-",
	    kauth_cred_getuid(vcp->ctx->vc_ucred),
	    on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
	    on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
	return error;
}
9113 
9114 /*
9115  * Authorize the deletion of the node vp from the directory dvp.
9116  *
9117  * We assume that:
9118  * - Neither the node nor the directory are immutable.
9119  * - The user is not the superuser.
9120  *
9121  * The precedence of factors for authorizing or denying delete for a credential
9122  *
9123  * 1) Explicit ACE on the node. (allow or deny DELETE)
9124  * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD).
9125  *
9126  *    If there are conflicting ACEs on the node and the directory, the node
9127  *    ACE wins.
9128  *
9129  * 3) Sticky bit on the directory.
9130  *    Deletion is not permitted if the directory is sticky and the caller is
9131  *    not owner of the node or directory. The sticky bit rules are like a deny
9132  *    delete ACE except lower in priority than ACL's either allowing or denying
9133  *    delete.
9134  *
 * 4) POSIX permissions on the directory.
9136  *
9137  * As an optimization, we cache whether or not delete child is permitted
9138  * on directories. This enables us to skip directory ACL and POSIX checks
9139  * as we already have the result from those checks. However, we always check the
9140  * node ACL and, if the directory has the sticky bit set, we always check its
9141  * ACL (even for a directory with an authorized delete child). Furthermore,
9142  * caching the delete child authorization is independent of the sticky bit
9143  * being set as it is only applicable in determining whether the node can be
9144  * deleted or not.
9145  */
9146 static int
vnode_authorize_delete(vauth_ctx vcp,boolean_t cached_delete_child)9147 vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
9148 {
9149 	struct vnode_attr       *vap = vcp->vap;
9150 	struct vnode_attr       *dvap = vcp->dvap;
9151 	kauth_cred_t            cred = vcp->ctx->vc_ucred;
9152 	struct kauth_acl_eval   eval;
9153 	int                     error, ismember;
9154 
9155 	/* Check the ACL on the node first */
9156 	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
9157 		eval.ae_requested = KAUTH_VNODE_DELETE;
9158 		eval.ae_acl = &vap->va_acl->acl_ace[0];
9159 		eval.ae_count = vap->va_acl->acl_entrycount;
9160 		eval.ae_options = 0;
9161 		if (vauth_file_owner(vcp)) {
9162 			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
9163 		}
9164 		/*
9165 		 * We use ENOENT as a marker to indicate we could not get
9166 		 * information in order to delay evaluation until after we
9167 		 * have the ACL evaluation answer.  Previously, we would
9168 		 * always deny the operation at this point.
9169 		 */
9170 		if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9171 			return error;
9172 		}
9173 		if (error == ENOENT) {
9174 			eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
9175 		} else if (ismember) {
9176 			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
9177 		}
9178 		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
9179 		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
9180 		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
9181 		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
9182 
9183 		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
9184 			KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
9185 			return error;
9186 		}
9187 
9188 		switch (eval.ae_result) {
9189 		case KAUTH_RESULT_DENY:
9190 			if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
9191 				KAUTH_DEBUG("%p    Override DENY due to entitlement", vcp->vp);
9192 				return 0;
9193 			}
9194 			KAUTH_DEBUG("%p    DENIED - denied by ACL", vcp->vp);
9195 			return EACCES;
9196 		case KAUTH_RESULT_ALLOW:
9197 			KAUTH_DEBUG("%p    ALLOWED - granted by ACL", vcp->vp);
9198 			return 0;
9199 		case KAUTH_RESULT_DEFER:
9200 		default:
9201 			/* Defer to directory */
9202 			KAUTH_DEBUG("%p    DEFERRED - by file ACL", vcp->vp);
9203 			break;
9204 		}
9205 	}
9206 
9207 	/*
9208 	 * Without a sticky bit, a previously authorized delete child is
9209 	 * sufficient to authorize this delete.
9210 	 *
9211 	 * If the sticky bit is set, a directory ACL which allows delete child
9212 	 * overrides a (potential) sticky bit deny. The authorized delete child
9213 	 * cannot tell us if it was authorized because of an explicit delete
9214 	 * child allow ACE or because of POSIX permisions so we have to check
9215 	 * the directory ACL everytime if the directory has a sticky bit.
9216 	 */
9217 	if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) {
9218 		KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp);
9219 		return 0;
9220 	}
9221 
9222 	/* check the ACL on the directory */
9223 	if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
9224 		eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
9225 		eval.ae_acl = &dvap->va_acl->acl_ace[0];
9226 		eval.ae_count = dvap->va_acl->acl_entrycount;
9227 		eval.ae_options = 0;
9228 		if (vauth_dir_owner(vcp)) {
9229 			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
9230 		}
9231 		/*
9232 		 * We use ENOENT as a marker to indicate we could not get
9233 		 * information in order to delay evaluation until after we
9234 		 * have the ACL evaluation answer.  Previously, we would
9235 		 * always deny the operation at this point.
9236 		 */
9237 		if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9238 			return error;
9239 		}
9240 		if (error == ENOENT) {
9241 			eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
9242 		} else if (ismember) {
9243 			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
9244 		}
9245 		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
9246 		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
9247 		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
9248 		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
9249 
9250 		/*
9251 		 * If there is no entry, we are going to defer to other
9252 		 * authorization mechanisms.
9253 		 */
9254 		error = kauth_acl_evaluate(cred, &eval);
9255 
9256 		if (error != 0) {
9257 			KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
9258 			return error;
9259 		}
9260 		switch (eval.ae_result) {
9261 		case KAUTH_RESULT_DENY:
9262 			if (vauth_dir_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
9263 				KAUTH_DEBUG("%p    Override DENY due to entitlement", vcp->vp);
9264 				return 0;
9265 			}
9266 			KAUTH_DEBUG("%p    DENIED - denied by directory ACL", vcp->vp);
9267 			return EACCES;
9268 		case KAUTH_RESULT_ALLOW:
9269 			KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL", vcp->vp);
9270 			if (!cached_delete_child && vcp->dvp) {
9271 				vnode_cache_authorized_action(vcp->dvp,
9272 				    vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
9273 			}
9274 			return 0;
9275 		case KAUTH_RESULT_DEFER:
9276 		default:
9277 			/* Deferred by directory ACL */
9278 			KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
9279 			break;
9280 		}
9281 	}
9282 
9283 	/*
9284 	 * From this point, we can't explicitly allow and if we reach the end
9285 	 * of the function without a denial, then the delete is authorized.
9286 	 */
9287 	if (!cached_delete_child) {
9288 		if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
9289 			KAUTH_DEBUG("%p    DENIED - denied by posix permisssions", vcp->vp);
9290 			return EACCES;
9291 		}
9292 		/*
9293 		 * Cache the authorized action on the vnode if allowed by the
9294 		 * directory ACL or POSIX permissions. It is correct to cache
9295 		 * this action even if sticky bit would deny deleting the node.
9296 		 */
9297 		if (vcp->dvp) {
9298 			vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
9299 			    KAUTH_VNODE_DELETE_CHILD);
9300 		}
9301 	}
9302 
9303 	/* enforce sticky bit behaviour */
9304 	if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
9305 		KAUTH_DEBUG("%p    DENIED - sticky bit rules (user %d  file %d  dir %d)",
9306 		    vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
9307 		return EACCES;
9308 	}
9309 
9310 	/* not denied, must be OK */
9311 	return 0;
9312 }
9313 
9314 
9315 /*
9316  * Authorize an operation based on the node's attributes.
9317  */
9318 static int
vnode_authorize_simple(vauth_ctx vcp,kauth_ace_rights_t acl_rights,kauth_ace_rights_t preauth_rights,boolean_t * found_deny)9319 vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
9320 {
9321 	struct vnode_attr       *vap = vcp->vap;
9322 	kauth_cred_t            cred = vcp->ctx->vc_ucred;
9323 	struct kauth_acl_eval   eval;
9324 	int                     error, ismember;
9325 	mode_t                  posix_action;
9326 
9327 	/*
9328 	 * If we are the file owner, we automatically have some rights.
9329 	 *
9330 	 * Do we need to expand this to support group ownership?
9331 	 */
9332 	if (vauth_file_owner(vcp)) {
9333 		acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
9334 	}
9335 
9336 	/*
9337 	 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
9338 	 * mask the latter.  If TAKE_OWNERSHIP is requested the caller is about to
9339 	 * change ownership to themselves, and WRITE_SECURITY is implicitly
9340 	 * granted to the owner.  We need to do this because at this point
9341 	 * WRITE_SECURITY may not be granted as the caller is not currently
9342 	 * the owner.
9343 	 */
9344 	if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
9345 	    (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) {
9346 		acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
9347 	}
9348 
9349 	if (acl_rights == 0) {
9350 		KAUTH_DEBUG("%p    ALLOWED - implicit or no rights required", vcp->vp);
9351 		return 0;
9352 	}
9353 
9354 	/* if we have an ACL, evaluate it */
9355 	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
9356 		eval.ae_requested = acl_rights;
9357 		eval.ae_acl = &vap->va_acl->acl_ace[0];
9358 		eval.ae_count = vap->va_acl->acl_entrycount;
9359 		eval.ae_options = 0;
9360 		if (vauth_file_owner(vcp)) {
9361 			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
9362 		}
9363 		/*
9364 		 * We use ENOENT as a marker to indicate we could not get
9365 		 * information in order to delay evaluation until after we
9366 		 * have the ACL evaluation answer.  Previously, we would
9367 		 * always deny the operation at this point.
9368 		 */
9369 		if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9370 			return error;
9371 		}
9372 		if (error == ENOENT) {
9373 			eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
9374 		} else if (ismember) {
9375 			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
9376 		}
9377 		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
9378 		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
9379 		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
9380 		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
9381 
9382 		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
9383 			KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
9384 			return error;
9385 		}
9386 
9387 		switch (eval.ae_result) {
9388 		case KAUTH_RESULT_DENY:
9389 			if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
9390 				KAUTH_DEBUG("%p    Override DENY due to entitlement", vcp->vp);
9391 				return 0;
9392 			}
9393 			KAUTH_DEBUG("%p    DENIED - by ACL", vcp->vp);
9394 			return EACCES;         /* deny, deny, counter-allege */
9395 		case KAUTH_RESULT_ALLOW:
9396 			KAUTH_DEBUG("%p    ALLOWED - all rights granted by ACL", vcp->vp);
9397 			return 0;
9398 		case KAUTH_RESULT_DEFER:
9399 		default:
9400 			/* Effectively the same as !delete_child_denied */
9401 			KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
9402 			break;
9403 		}
9404 
9405 		*found_deny = eval.ae_found_deny;
9406 
9407 		/* fall through and evaluate residual rights */
9408 	} else {
9409 		/* no ACL, everything is residual */
9410 		eval.ae_residual = acl_rights;
9411 	}
9412 
9413 	/*
9414 	 * Grant residual rights that have been pre-authorized.
9415 	 */
9416 	eval.ae_residual &= ~preauth_rights;
9417 
9418 	/*
9419 	 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
9420 	 */
9421 	if (vauth_file_owner(vcp)) {
9422 		eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
9423 	}
9424 
9425 	if (eval.ae_residual == 0) {
9426 		KAUTH_DEBUG("%p    ALLOWED - rights already authorized", vcp->vp);
9427 		return 0;
9428 	}
9429 
9430 	/*
9431 	 * Bail if we have residual rights that can't be granted by posix permissions,
9432 	 * or aren't presumed granted at this point.
9433 	 *
9434 	 * XXX these can be collapsed for performance
9435 	 */
9436 	if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
9437 		KAUTH_DEBUG("%p    DENIED - CHANGE_OWNER not permitted", vcp->vp);
9438 		return EACCES;
9439 	}
9440 	if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
9441 		KAUTH_DEBUG("%p    DENIED - WRITE_SECURITY not permitted", vcp->vp);
9442 		return EACCES;
9443 	}
9444 
9445 #if DIAGNOSTIC
9446 	if (eval.ae_residual & KAUTH_VNODE_DELETE) {
9447 		panic("vnode_authorize: can't be checking delete permission here");
9448 	}
9449 #endif
9450 
9451 	/*
9452 	 * Compute the fallback posix permissions that will satisfy the remaining
9453 	 * rights.
9454 	 */
9455 	posix_action = 0;
9456 	if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
9457 	    KAUTH_VNODE_LIST_DIRECTORY |
9458 	    KAUTH_VNODE_READ_EXTATTRIBUTES)) {
9459 		posix_action |= VREAD;
9460 	}
9461 	if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
9462 	    KAUTH_VNODE_ADD_FILE |
9463 	    KAUTH_VNODE_ADD_SUBDIRECTORY |
9464 	    KAUTH_VNODE_DELETE_CHILD |
9465 	    KAUTH_VNODE_WRITE_ATTRIBUTES |
9466 	    KAUTH_VNODE_WRITE_EXTATTRIBUTES)) {
9467 		posix_action |= VWRITE;
9468 	}
9469 	if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
9470 	    KAUTH_VNODE_SEARCH)) {
9471 		posix_action |= VEXEC;
9472 	}
9473 
9474 	if (posix_action != 0) {
9475 		return vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */);
9476 	} else {
9477 		KAUTH_DEBUG("%p    ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
9478 		    vcp->vp,
9479 		    (eval.ae_residual & KAUTH_VNODE_READ_DATA)
9480 		    ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
9481 		    (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
9482 		    ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
9483 		    (eval.ae_residual & KAUTH_VNODE_EXECUTE)
9484 		    ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
9485 		    (eval.ae_residual & KAUTH_VNODE_DELETE)
9486 		    ? " DELETE" : "",
9487 		    (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
9488 		    ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
9489 		    (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
9490 		    ? " DELETE_CHILD" : "",
9491 		    (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
9492 		    ? " READ_ATTRIBUTES" : "",
9493 		    (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
9494 		    ? " WRITE_ATTRIBUTES" : "",
9495 		    (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
9496 		    ? " READ_EXTATTRIBUTES" : "",
9497 		    (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
9498 		    ? " WRITE_EXTATTRIBUTES" : "",
9499 		    (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
9500 		    ? " READ_SECURITY" : "",
9501 		    (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
9502 		    ? " WRITE_SECURITY" : "",
9503 		    (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
9504 		    ? " CHECKIMMUTABLE" : "",
9505 		    (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
9506 		    ? " CHANGE_OWNER" : "");
9507 	}
9508 
9509 	/*
9510 	 * Lack of required Posix permissions implies no reason to deny access.
9511 	 */
9512 	return 0;
9513 }
9514 
9515 /*
9516  * Check for file immutability.
9517  */
9518 static int
vnode_authorize_checkimmutable(mount_t mp,vauth_ctx vcp,struct vnode_attr * vap,int rights,int ignore)9519 vnode_authorize_checkimmutable(mount_t mp, vauth_ctx vcp,
9520     struct vnode_attr *vap, int rights, int ignore)
9521 {
9522 	int error;
9523 	int append;
9524 
9525 	/*
9526 	 * Perform immutability checks for operations that change data.
9527 	 *
9528 	 * Sockets, fifos and devices require special handling.
9529 	 */
9530 	switch (vap->va_type) {
9531 	case VSOCK:
9532 	case VFIFO:
9533 	case VBLK:
9534 	case VCHR:
9535 		/*
9536 		 * Writing to these nodes does not change the filesystem data,
9537 		 * so forget that it's being tried.
9538 		 */
9539 		rights &= ~KAUTH_VNODE_WRITE_DATA;
9540 		break;
9541 	default:
9542 		break;
9543 	}
9544 
9545 	error = 0;
9546 	if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
9547 		/* check per-filesystem options if possible */
9548 		if (mp != NULL) {
9549 			/* check for no-EA filesystems */
9550 			if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
9551 			    (vfs_flags(mp) & MNT_NOUSERXATTR)) {
9552 				KAUTH_DEBUG("%p    DENIED - filesystem disallowed extended attributes", vap);
9553 				error = EACCES;  /* User attributes disabled */
9554 				goto out;
9555 			}
9556 		}
9557 
9558 		/*
9559 		 * check for file immutability. first, check if the requested rights are
9560 		 * allowable for a UF_APPEND file.
9561 		 */
9562 		append = 0;
9563 		if (vap->va_type == VDIR) {
9564 			if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES | ~KAUTH_VNODE_WRITE_RIGHTS)) == rights) {
9565 				append = 1;
9566 			}
9567 		} else {
9568 			if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES | ~KAUTH_VNODE_WRITE_RIGHTS)) == rights) {
9569 				append = 1;
9570 			}
9571 		}
9572 		if ((error = vnode_immutable(vap, append, ignore)) != 0) {
9573 			if (error && !ignore) {
9574 				/*
9575 				 * In case of a rename, we want to check ownership for dvp as well.
9576 				 */
9577 				int owner = 0;
9578 				if (rights & KAUTH_VNODE_DELETE_CHILD && vcp->dvp != NULL) {
9579 					owner = vauth_file_owner(vcp) && vauth_dir_owner(vcp);
9580 				} else {
9581 					owner = vauth_file_owner(vcp);
9582 				}
9583 				if (owner && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
9584 					error = vnode_immutable(vap, append, 1);
9585 				}
9586 			}
9587 		}
9588 		if (error) {
9589 			KAUTH_DEBUG("%p    DENIED - file is immutable", vap);
9590 			goto out;
9591 		}
9592 	}
9593 out:
9594 	return error;
9595 }
9596 
9597 /*
9598  * Handle authorization actions for filesystems that advertise that the
9599  * server will be enforcing.
9600  *
9601  * Returns:	0			Authorization should be handled locally
9602  *		1			Authorization was handled by the FS
9603  *
9604  * Note:	Imputed returns will only occur if the authorization request
9605  *		was handled by the FS.
9606  *
9607  * Imputed:	*resultp, modified	Return code from FS when the request is
9608  *					handled by the FS.
9609  *		VNOP_ACCESS:???
9610  *		VNOP_OPEN:???
9611  */
9612 static int
vnode_authorize_opaque(vnode_t vp,int * resultp,kauth_action_t action,vfs_context_t ctx)9613 vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
9614 {
9615 	int     error;
9616 
9617 	/*
9618 	 * If the vp is a device node, socket or FIFO it actually represents a local
9619 	 * endpoint, so we need to handle it locally.
9620 	 */
9621 	switch (vp->v_type) {
9622 	case VBLK:
9623 	case VCHR:
9624 	case VSOCK:
9625 	case VFIFO:
9626 		return 0;
9627 	default:
9628 		break;
9629 	}
9630 
9631 	/*
9632 	 * In the advisory request case, if the filesystem doesn't think it's reliable
9633 	 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
9634 	 */
9635 	if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) {
9636 		return 0;
9637 	}
9638 
9639 	/*
9640 	 * Let the filesystem have a say in the matter.  It's OK for it to not implemnent
9641 	 * VNOP_ACCESS, as most will authorise inline with the actual request.
9642 	 */
9643 	if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
9644 		*resultp = error;
9645 		KAUTH_DEBUG("%p    DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
9646 		return 1;
9647 	}
9648 
9649 	/*
9650 	 * Typically opaque filesystems do authorisation in-line, but exec is a special case.  In
9651 	 * order to be reasonably sure that exec will be permitted, we try a bit harder here.
9652 	 */
9653 	if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
9654 		/* try a VNOP_OPEN for readonly access */
9655 		if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
9656 			*resultp = error;
9657 			KAUTH_DEBUG("%p    DENIED - EXECUTE denied because file could not be opened readonly", vp);
9658 			return 1;
9659 		}
9660 		VNOP_CLOSE(vp, FREAD, ctx);
9661 	}
9662 
9663 	/*
9664 	 * We don't have any reason to believe that the request has to be denied at this point,
9665 	 * so go ahead and allow it.
9666 	 */
9667 	*resultp = 0;
9668 	KAUTH_DEBUG("%p    ALLOWED - bypassing access check for non-local filesystem", vp);
9669 	return 1;
9670 }
9671 
9672 
9673 
9674 
9675 /*
9676  * Returns:	KAUTH_RESULT_ALLOW
9677  *		KAUTH_RESULT_DENY
9678  *
9679  * Imputed:	*arg3, modified		Error code in the deny case
9680  *		EROFS			Read-only file system
9681  *		EACCES			Permission denied
9682  *		EPERM			Operation not permitted [no execute]
9683  *	vnode_getattr:ENOMEM		Not enough space [only if has filesec]
9684  *	vnode_getattr:???
9685  *	vnode_authorize_opaque:*arg2	???
9686  *	vnode_authorize_checkimmutable:???
9687  *	vnode_authorize_delete:???
9688  *	vnode_authorize_simple:???
9689  */
9690 
9691 
9692 static int
vnode_authorize_callback(__unused kauth_cred_t cred,__unused void * idata,kauth_action_t action,uintptr_t arg0,uintptr_t arg1,uintptr_t arg2,uintptr_t arg3)9693 vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata,
9694     kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
9695     uintptr_t arg3)
9696 {
9697 	vfs_context_t   ctx;
9698 	vnode_t         cvp = NULLVP;
9699 	vnode_t         vp, dvp;
9700 	int             result = KAUTH_RESULT_DENY;
9701 	int             parent_iocount = 0;
9702 	int             parent_action = 0; /* In case we need to use namedstream's data fork for cached rights*/
9703 
9704 	ctx = (vfs_context_t)arg0;
9705 	vp = (vnode_t)arg1;
9706 	dvp = (vnode_t)arg2;
9707 
9708 	/*
9709 	 * if there are 2 vnodes passed in, we don't know at
9710 	 * this point which rights to look at based on the
9711 	 * combined action being passed in... defer until later...
9712 	 * otherwise check the kauth 'rights' cache hung
9713 	 * off of the vnode we're interested in... if we've already
9714 	 * been granted the right we're currently interested in,
9715 	 * we can just return success... otherwise we'll go through
9716 	 * the process of authorizing the requested right(s)... if that
9717 	 * succeeds, we'll add the right(s) to the cache.
9718 	 * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
9719 	 */
9720 	if (dvp && vp) {
9721 		goto defer;
9722 	}
9723 	if (dvp) {
9724 		cvp = dvp;
9725 	} else {
9726 		/*
9727 		 * For named streams on local-authorization volumes, rights are cached on the parent;
9728 		 * authorization is determined by looking at the parent's properties anyway, so storing
9729 		 * on the parent means that we don't recompute for the named stream and that if
9730 		 * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the
9731 		 * stream to flush its cache separately.  If we miss in the cache, then we authorize
9732 		 * as if there were no cached rights (passing the named stream vnode and desired rights to
9733 		 * vnode_authorize_callback_int()).
9734 		 *
9735 		 * On an opaquely authorized volume, we don't know the relationship between the
9736 		 * data fork's properties and the rights granted on a stream.  Thus, named stream vnodes
9737 		 * on such a volume are authorized directly (rather than using the parent) and have their
9738 		 * own caches.  When a named stream vnode is created, we mark the parent as having a named
9739 		 * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we
9740 		 * find the stream and flush its cache.
9741 		 */
9742 		if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
9743 			cvp = vnode_getparent(vp);
9744 			if (cvp != NULLVP) {
9745 				parent_iocount = 1;
9746 			} else {
9747 				cvp = NULL;
9748 				goto defer; /* If we can't use the parent, take the slow path */
9749 			}
9750 
9751 			/* Have to translate some actions */
9752 			parent_action = action;
9753 			if (parent_action & KAUTH_VNODE_READ_DATA) {
9754 				parent_action &= ~KAUTH_VNODE_READ_DATA;
9755 				parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES;
9756 			}
9757 			if (parent_action & KAUTH_VNODE_WRITE_DATA) {
9758 				parent_action &= ~KAUTH_VNODE_WRITE_DATA;
9759 				parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
9760 			}
9761 		} else {
9762 			cvp = vp;
9763 		}
9764 	}
9765 
9766 	if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
9767 		result = KAUTH_RESULT_ALLOW;
9768 		goto out;
9769 	}
9770 defer:
9771 	result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
9772 
9773 	if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
9774 		KAUTH_DEBUG("%p - caching action = %x", cvp, action);
9775 		vnode_cache_authorized_action(cvp, ctx, action);
9776 	}
9777 
9778 out:
9779 	if (parent_iocount) {
9780 		vnode_put(cvp);
9781 	}
9782 
9783 	return result;
9784 }
9785 
/*
 * Core rights evaluation against already-fetched vnode (and parent)
 * attributes.  Performs immutability checks, then either evaluates the
 * delete/simple paths for ordinary users or applies the reduced superuser
 * checks.  *found_deny is set when the caller must not cache a global
 * search right (either a DENY ACE was seen, or the caller is superuser and
 * ACLs were not consulted).
 *
 * Returns 0 on success, an errno on denial.
 */
static int
vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
    kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny,
    int noimmutable, int parent_authorized_for_delete_child)
{
	int result;

	/*
	 * Check for immutability.
	 *
	 * In the deletion case, parent directory immutability vetoes specific
	 * file rights.
	 */
	if ((result = vnode_authorize_checkimmutable(mp, vcp, vcp->vap, rights,
	    noimmutable)) != 0) {
		goto out;
	}

	/* for delete, the parent directory's immutability is also checked */
	if ((rights & KAUTH_VNODE_DELETE) &&
	    !parent_authorized_for_delete_child) {
		result = vnode_authorize_checkimmutable(mp, vcp, vcp->dvap,
		    KAUTH_VNODE_DELETE_CHILD, 0);
		if (result) {
			goto out;
		}
	}

	/*
	 * Clear rights that have been authorized by reaching this point, bail if nothing left to
	 * check.
	 */
	rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
	if (rights == 0) {
		goto out;
	}

	/*
	 * If we're not the superuser, authorize based on file properties;
	 * note that even if parent_authorized_for_delete_child is TRUE, we
	 * need to check on the node itself.
	 */
	if (!is_suser) {
		/* process delete rights */
		if ((rights & KAUTH_VNODE_DELETE) &&
		    ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) {
			goto out;
		}

		/* process remaining rights */
		if ((rights & ~KAUTH_VNODE_DELETE) &&
		    (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0) {
			goto out;
		}
	} else {
		/*
		 * Execute is only granted to root if one of the x bits is set.  This check only
		 * makes sense if the posix mode bits are actually supported.
		 */
		if ((rights & KAUTH_VNODE_EXECUTE) &&
		    (vcp->vap->va_type == VREG) &&
		    VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
		    !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
			result = EPERM;
			KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vcp, vcp->vap->va_mode);
			goto out;
		}

		/* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
		*found_deny = TRUE;

		KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vcp);
	}
out:
	return result;
}
9861 
/*
 * Full (cache-miss) authorization path for the vnode kauth scope.
 *
 * Splits 'action' into control bits and requested rights, handles
 * read-only/noexec mounts and opaquely-authorized filesystems, fetches
 * the attributes needed for the check (mode/flags always; uid/gid/ACL
 * only for non-superuser callers), and delegates the rights evaluation
 * to vnode_attr_authorize_internal().  On success for a directory search
 * with no DENY ACEs and world-execute posix bits, caches the global
 * KAUTH_VNODE_SEARCHBYANYONE right on the vnode.
 *
 * Returns KAUTH_RESULT_ALLOW or KAUTH_RESULT_DENY; on deny, *errorp
 * carries the errno.
 */
static int
vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
    vnode_t vp, vnode_t dvp, int *errorp)
{
	struct _vnode_authorize_context auth_context;
	vauth_ctx               vcp;
	kauth_cred_t            cred;
	kauth_ace_rights_t      rights;
	struct vnode_attr       va, dva;
	int                     result;
	int                     noimmutable;
	boolean_t               parent_authorized_for_delete_child = FALSE;
	boolean_t               found_deny = FALSE;
	boolean_t               parent_ref = FALSE;
	boolean_t               is_suser = FALSE;

	vcp = &auth_context;
	vcp->ctx = ctx;
	vcp->vp = vp;
	vcp->dvp = dvp;
	/*
	 * Note that we authorize against the context, not the passed cred
	 * (the same thing anyway)
	 */
	cred = ctx->vc_ucred;

	VATTR_INIT(&va);
	vcp->vap = &va;
	VATTR_INIT(&dva);
	vcp->dvap = &dva;

	vcp->flags = vcp->flags_valid = 0;

#if DIAGNOSTIC
	if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) {
		panic("vnode_authorize: bad arguments (context %p  vp %p  cred %p)", ctx, vp, cred);
	}
#endif

	KAUTH_DEBUG("%p  AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
	    vp, vfs_context_proc(ctx)->p_comm,
	    (action & KAUTH_VNODE_ACCESS)               ? "access" : "auth",
	    (action & KAUTH_VNODE_READ_DATA)            ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
	    (action & KAUTH_VNODE_WRITE_DATA)           ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
	    (action & KAUTH_VNODE_EXECUTE)              ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
	    (action & KAUTH_VNODE_DELETE)               ? " DELETE" : "",
	    (action & KAUTH_VNODE_APPEND_DATA)          ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
	    (action & KAUTH_VNODE_DELETE_CHILD)         ? " DELETE_CHILD" : "",
	    (action & KAUTH_VNODE_READ_ATTRIBUTES)      ? " READ_ATTRIBUTES" : "",
	    (action & KAUTH_VNODE_WRITE_ATTRIBUTES)     ? " WRITE_ATTRIBUTES" : "",
	    (action & KAUTH_VNODE_READ_EXTATTRIBUTES)   ? " READ_EXTATTRIBUTES" : "",
	    (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES)  ? " WRITE_EXTATTRIBUTES" : "",
	    (action & KAUTH_VNODE_READ_SECURITY)        ? " READ_SECURITY" : "",
	    (action & KAUTH_VNODE_WRITE_SECURITY)       ? " WRITE_SECURITY" : "",
	    (action & KAUTH_VNODE_CHANGE_OWNER)         ? " CHANGE_OWNER" : "",
	    (action & KAUTH_VNODE_NOIMMUTABLE)          ? " (noimmutable)" : "",
	    vnode_isdir(vp) ? "directory" : "file",
	    vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);

	/*
	 * Extract the control bits from the action, everything else is
	 * requested rights.
	 */
	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
	rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);

	if (rights & KAUTH_VNODE_DELETE) {
#if DIAGNOSTIC
		if (dvp == NULL) {
			panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
		}
#endif
		/*
		 * check to see if we've already authorized the parent
		 * directory for deletion of its children... if so, we
		 * can skip a whole bunch of work... we will still have to
		 * authorize that this specific child can be removed
		 */
		if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) {
			parent_authorized_for_delete_child = TRUE;
		}
	} else {
		/* parent attributes are only needed for the delete case */
		vcp->dvp = NULLVP;
		vcp->dvap = NULL;
	}

	/*
	 * Check for read-only filesystems.
	 */
	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY) &&
	    ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
	    (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
	    (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
		result = EROFS;
		goto out;
	}

	/*
	 * Check for noexec filesystems.
	 */
	if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
		result = EACCES;
		goto out;
	}

	/*
	 * Handle cases related to filesystems with non-local enforcement.
	 * This call can return 0, in which case we will fall through to perform a
	 * check based on VNOP_GETATTR data.  Otherwise it returns 1 and sets
	 * an appropriate result, at which point we can return immediately.
	 */
	if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx)) {
		goto out;
	}

	/*
	 * If the vnode is a namedstream (extended attribute) data vnode (eg.
	 * a resource fork), *_DATA becomes *_EXTATTRIBUTES.
	 */
	if (vnode_isnamedstream(vp)) {
		if (rights & KAUTH_VNODE_READ_DATA) {
			rights &= ~KAUTH_VNODE_READ_DATA;
			rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
		}
		if (rights & KAUTH_VNODE_WRITE_DATA) {
			rights &= ~KAUTH_VNODE_WRITE_DATA;
			rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
		}

		/*
		 * Point 'vp' to the namedstream's parent for ACL checking
		 */
		if ((vp->v_parent != NULL) &&
		    (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
			/* parent_ref: iocount on vp must be dropped before returning */
			parent_ref = TRUE;
			vcp->vp = vp = vp->v_parent;
		}
	}

	if (vfs_context_issuser(ctx)) {
		/*
		 * if we're not asking for execute permissions or modifications,
		 * then we're done, this action is authorized.
		 */
		if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
			goto success;
		}

		is_suser = TRUE;
	}

	/*
	 * Get vnode attributes and extended security information for the vnode
	 * and directory if required.
	 *
	 * If we're root we only want mode bits and flags for checking
	 * execute and immutability.
	 */
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (!is_suser) {
		VATTR_WANTED(&va, va_uid);
		VATTR_WANTED(&va, va_gid);
		VATTR_WANTED(&va, va_acl);
	}
	if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
		KAUTH_DEBUG("%p    ERROR - failed to get vnode attributes - %d", vp, result);
		goto out;
	}
	/* va_type is known from the vnode itself; mark it supplied */
	VATTR_WANTED(&va, va_type);
	VATTR_RETURN(&va, va_type, vnode_vtype(vp));

	if (vcp->dvp) {
		VATTR_WANTED(&dva, va_mode);
		VATTR_WANTED(&dva, va_flags);
		if (!is_suser) {
			VATTR_WANTED(&dva, va_uid);
			VATTR_WANTED(&dva, va_gid);
			VATTR_WANTED(&dva, va_acl);
		}
		if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
			KAUTH_DEBUG("%p    ERROR - failed to get directory vnode attributes - %d", vp, result);
			goto out;
		}
		VATTR_WANTED(&dva, va_type);
		VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp));
	}

	result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
	    &found_deny, noimmutable, parent_authorized_for_delete_child);
out:
	/* vnode_getattr() allocated any ACLs; release them on every path */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
		kauth_acl_free(va.va_acl);
	}
	if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) {
		kauth_acl_free(dva.va_acl);
	}

	if (result) {
		if (parent_ref) {
			vnode_put(vp);
		}
		*errorp = result;
		KAUTH_DEBUG("%p    DENIED - auth denied", vp);
		return KAUTH_RESULT_DENY;
	}
	if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
		/*
		 * if we were successfully granted the right to search this directory
		 * and there were NO ACL DENYs for search and the posix permissions also don't
		 * deny execute, we can synthesize a global right that allows anyone to
		 * traverse this directory during a pathname lookup without having to
		 * match the credential associated with this cache of rights.
		 *
		 * Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE
		 * only if we actually check ACLs which we don't for root. As
		 * a workaround, the lookup fast path checks for root.
		 */
		if (!VATTR_IS_SUPPORTED(&va, va_mode) ||
		    ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ==
		    (S_IXUSR | S_IXGRP | S_IXOTH))) {
			vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
		}
	}
success:
	if (parent_ref) {
		vnode_put(vp);
	}

	/*
	 * Note that this implies that we will allow requests for no rights, as well as
	 * for rights that we do not recognise.  There should be none of these.
	 */
	KAUTH_DEBUG("%p    ALLOWED - auth granted", vp);
	return KAUTH_RESULT_ALLOW;
}
10099 
10100 int
vnode_attr_authorize_init(struct vnode_attr * vap,struct vnode_attr * dvap,kauth_action_t action,vfs_context_t ctx)10101 vnode_attr_authorize_init(struct vnode_attr *vap, struct vnode_attr *dvap,
10102     kauth_action_t action, vfs_context_t ctx)
10103 {
10104 	VATTR_INIT(vap);
10105 	VATTR_WANTED(vap, va_type);
10106 	VATTR_WANTED(vap, va_mode);
10107 	VATTR_WANTED(vap, va_flags);
10108 	if (dvap) {
10109 		VATTR_INIT(dvap);
10110 		if (action & KAUTH_VNODE_DELETE) {
10111 			VATTR_WANTED(dvap, va_type);
10112 			VATTR_WANTED(dvap, va_mode);
10113 			VATTR_WANTED(dvap, va_flags);
10114 		}
10115 	} else if (action & KAUTH_VNODE_DELETE) {
10116 		return EINVAL;
10117 	}
10118 
10119 	if (!vfs_context_issuser(ctx)) {
10120 		VATTR_WANTED(vap, va_uid);
10121 		VATTR_WANTED(vap, va_gid);
10122 		VATTR_WANTED(vap, va_acl);
10123 		if (dvap && (action & KAUTH_VNODE_DELETE)) {
10124 			VATTR_WANTED(dvap, va_uid);
10125 			VATTR_WANTED(dvap, va_gid);
10126 			VATTR_WANTED(dvap, va_acl);
10127 		}
10128 	}
10129 
10130 	return 0;
10131 }
10132 
10133 #define VNODE_SEC_ATTRS_NO_ACL (VNODE_ATTR_va_uid | VNODE_ATTR_va_gid | VNODE_ATTR_va_mode | VNODE_ATTR_va_flags | VNODE_ATTR_va_type)
10134 
10135 int
vnode_attr_authorize(struct vnode_attr * vap,struct vnode_attr * dvap,mount_t mp,kauth_action_t action,vfs_context_t ctx)10136 vnode_attr_authorize(struct vnode_attr *vap, struct vnode_attr *dvap, mount_t mp,
10137     kauth_action_t action, vfs_context_t ctx)
10138 {
10139 	struct _vnode_authorize_context auth_context;
10140 	vauth_ctx vcp;
10141 	kauth_ace_rights_t rights;
10142 	int noimmutable;
10143 	boolean_t found_deny;
10144 	boolean_t is_suser = FALSE;
10145 	int result = 0;
10146 	uid_t ouid = vap->va_uid;
10147 	gid_t ogid = vap->va_gid;
10148 
10149 	vcp = &auth_context;
10150 	vcp->ctx = ctx;
10151 	vcp->vp = NULLVP;
10152 	vcp->vap = vap;
10153 	vcp->dvp = NULLVP;
10154 	vcp->dvap = dvap;
10155 	vcp->flags = vcp->flags_valid = 0;
10156 
10157 	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
10158 	rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
10159 
10160 	/*
10161 	 * Check for read-only filesystems.
10162 	 */
10163 	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
10164 	    mp && (mp->mnt_flag & MNT_RDONLY) &&
10165 	    ((vap->va_type == VREG) || (vap->va_type == VDIR) ||
10166 	    (vap->va_type == VLNK) || (rights & KAUTH_VNODE_DELETE) ||
10167 	    (rights & KAUTH_VNODE_DELETE_CHILD))) {
10168 		result = EROFS;
10169 		goto out;
10170 	}
10171 
10172 	/*
10173 	 * Check for noexec filesystems.
10174 	 */
10175 	if ((rights & KAUTH_VNODE_EXECUTE) &&
10176 	    (vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) {
10177 		result = EACCES;
10178 		goto out;
10179 	}
10180 
10181 	if (vfs_context_issuser(ctx)) {
10182 		/*
10183 		 * if we're not asking for execute permissions or modifications,
10184 		 * then we're done, this action is authorized.
10185 		 */
10186 		if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
10187 			goto out;
10188 		}
10189 		is_suser = TRUE;
10190 	}
10191 
10192 	if (mp) {
10193 		if (vfs_extendedsecurity(mp) && VATTR_IS_ACTIVE(vap, va_acl) && !VATTR_IS_SUPPORTED(vap, va_acl)) {
10194 			panic("(1) vnode attrs not complete for vnode_attr_authorize");
10195 		}
10196 		vnode_attr_handle_uid_and_gid(vap, mp, ctx);
10197 	}
10198 
10199 	if ((vap->va_active & VNODE_SEC_ATTRS_NO_ACL) != (vap->va_supported & VNODE_SEC_ATTRS_NO_ACL)) {
10200 		panic("(2) vnode attrs not complete for vnode_attr_authorize (2) vap->va_active = 0x%llx , vap->va_supported = 0x%llx",
10201 		    vap->va_active, vap->va_supported);
10202 	}
10203 
10204 	result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
10205 	    &found_deny, noimmutable, FALSE);
10206 
10207 	if (mp) {
10208 		vap->va_uid = ouid;
10209 		vap->va_gid = ogid;
10210 	}
10211 
10212 	if (result == EPERM) {
10213 		result = EACCES;
10214 	}
10215 out:
10216 	return result;
10217 }
10218 
10219 
/*
 * Check that the attribute information in vap can be legally applied to a
 * new object created in directory dvp, defaulting any missing fields.
 *
 * Convenience wrapper around vnode_authattr_new_internal() for callers that
 * do not need to know which fields were defaulted.
 *
 * Returns 0 on success or an errno value.
 */
int
vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
{
	return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx);
}
10225 
10226 /*
10227  * Check that the attribute information in vattr can be legally applied to
10228  * a new file by the context.
10229  */
10230 static int
vnode_authattr_new_internal(vnode_t dvp,struct vnode_attr * vap,int noauth,uint32_t * defaulted_fieldsp,vfs_context_t ctx)10231 vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
10232 {
10233 	int             error;
10234 	int             has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
10235 	uint32_t        inherit_flags;
10236 	kauth_cred_t    cred;
10237 	guid_t          changer;
10238 	mount_t         dmp;
10239 	struct vnode_attr dva;
10240 
10241 	error = 0;
10242 
10243 	if (defaulted_fieldsp) {
10244 		*defaulted_fieldsp = 0;
10245 	}
10246 
10247 	defaulted_owner = defaulted_group = defaulted_mode = 0;
10248 
10249 	inherit_flags = 0;
10250 
10251 	/*
10252 	 * Require that the filesystem support extended security to apply any.
10253 	 */
10254 	if (!vfs_extendedsecurity(dvp->v_mount) &&
10255 	    (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
10256 		error = EINVAL;
10257 		goto out;
10258 	}
10259 
10260 	/*
10261 	 * Default some fields.
10262 	 */
10263 	dmp = dvp->v_mount;
10264 
10265 	/*
10266 	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
10267 	 * owner takes ownership of all new files.
10268 	 */
10269 	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
10270 		VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
10271 		defaulted_owner = 1;
10272 	} else {
10273 		if (!VATTR_IS_ACTIVE(vap, va_uid)) {
10274 			/* default owner is current user */
10275 			VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
10276 			defaulted_owner = 1;
10277 		}
10278 	}
10279 
10280 	/*
10281 	 * We need the dvp's va_flags and *may* need the gid of the directory,
10282 	 * we ask for both here.
10283 	 */
10284 	VATTR_INIT(&dva);
10285 	VATTR_WANTED(&dva, va_gid);
10286 	VATTR_WANTED(&dva, va_flags);
10287 	if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) {
10288 		goto out;
10289 	}
10290 
10291 	/*
10292 	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that
10293 	 * group takes ownership of all new files.
10294 	 */
10295 	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
10296 		VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
10297 		defaulted_group = 1;
10298 	} else {
10299 		if (!VATTR_IS_ACTIVE(vap, va_gid)) {
10300 			/* default group comes from parent object, fallback to current user */
10301 			if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
10302 				VATTR_SET(vap, va_gid, dva.va_gid);
10303 			} else {
10304 				VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
10305 			}
10306 			defaulted_group = 1;
10307 		}
10308 	}
10309 
10310 	if (!VATTR_IS_ACTIVE(vap, va_flags)) {
10311 		VATTR_SET(vap, va_flags, 0);
10312 	}
10313 
10314 	/* Determine if SF_RESTRICTED should be inherited from the parent
10315 	 * directory. */
10316 	if (VATTR_IS_SUPPORTED(&dva, va_flags)) {
10317 		inherit_flags = dva.va_flags & (UF_DATAVAULT | SF_RESTRICTED);
10318 	}
10319 
10320 	/* default mode is everything, masked with current umask */
10321 	if (!VATTR_IS_ACTIVE(vap, va_mode)) {
10322 		VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd.fd_cmask);
10323 		KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o",
10324 		    vap->va_mode, vfs_context_proc(ctx)->p_fd.fd_cmask);
10325 		defaulted_mode = 1;
10326 	}
10327 	/* set timestamps to now */
10328 	if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
10329 		nanotime(&vap->va_create_time);
10330 		VATTR_SET_ACTIVE(vap, va_create_time);
10331 	}
10332 
10333 	/*
10334 	 * Check for attempts to set nonsensical fields.
10335 	 */
10336 	if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
10337 		error = EINVAL;
10338 		KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
10339 		    vap->va_active & ~VNODE_ATTR_NEWOBJ);
10340 		goto out;
10341 	}
10342 
10343 	/*
10344 	 * Quickly check for the applicability of any enforcement here.
10345 	 * Tests below maintain the integrity of the local security model.
10346 	 */
10347 	if (vfs_authopaque(dvp->v_mount)) {
10348 		goto out;
10349 	}
10350 
10351 	/*
10352 	 * We need to know if the caller is the superuser, or if the work is
10353 	 * otherwise already authorised.
10354 	 */
10355 	cred = vfs_context_ucred(ctx);
10356 	if (noauth) {
10357 		/* doing work for the kernel */
10358 		has_priv_suser = 1;
10359 	} else {
10360 		has_priv_suser = vfs_context_issuser(ctx);
10361 	}
10362 
10363 
10364 	if (VATTR_IS_ACTIVE(vap, va_flags)) {
10365 		vap->va_flags &= ~SF_SYNTHETIC;
10366 		if (has_priv_suser) {
10367 			if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
10368 				error = EPERM;
10369 				KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
10370 				goto out;
10371 			}
10372 		} else {
10373 			if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
10374 				error = EPERM;
10375 				KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
10376 				goto out;
10377 			}
10378 		}
10379 	}
10380 
10381 	/* if not superuser, validate legality of new-item attributes */
10382 	if (!has_priv_suser) {
10383 		if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
10384 			/* setgid? */
10385 			if (vap->va_mode & S_ISGID) {
10386 				if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
10387 					KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
10388 					goto out;
10389 				}
10390 				if (!ismember) {
10391 					KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
10392 					error = EPERM;
10393 					goto out;
10394 				}
10395 			}
10396 
10397 			/* setuid? */
10398 			if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
10399 				KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
10400 				error = EPERM;
10401 				goto out;
10402 			}
10403 		}
10404 		if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
10405 			KAUTH_DEBUG("  DENIED - cannot create new item owned by %d", vap->va_uid);
10406 			error = EPERM;
10407 			goto out;
10408 		}
10409 		if (!defaulted_group) {
10410 			if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
10411 				KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
10412 				goto out;
10413 			}
10414 			if (!ismember) {
10415 				KAUTH_DEBUG("  DENIED - cannot create new item with group %d - not a member", vap->va_gid);
10416 				error = EPERM;
10417 				goto out;
10418 			}
10419 		}
10420 
10421 		/* initialising owner/group UUID */
10422 		if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
10423 			if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
10424 				KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
10425 				/* XXX ENOENT here - no GUID - should perhaps become EPERM */
10426 				goto out;
10427 			}
10428 			if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
10429 				KAUTH_DEBUG("  ERROR - cannot create item with supplied owner UUID - not us");
10430 				error = EPERM;
10431 				goto out;
10432 			}
10433 		}
10434 		if (VATTR_IS_ACTIVE(vap, va_guuid)) {
10435 			if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
10436 				KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
10437 				goto out;
10438 			}
10439 			if (!ismember) {
10440 				KAUTH_DEBUG("  ERROR - cannot create item with supplied group UUID - not a member");
10441 				error = EPERM;
10442 				goto out;
10443 			}
10444 		}
10445 	}
10446 out:
10447 	if (inherit_flags) {
10448 		/* Apply SF_RESTRICTED to the file if its parent directory was
10449 		 * restricted.  This is done at the end so that root is not
10450 		 * required if this flag is only set due to inheritance. */
10451 		VATTR_SET(vap, va_flags, (vap->va_flags | inherit_flags));
10452 	}
10453 	if (defaulted_fieldsp) {
10454 		if (defaulted_mode) {
10455 			*defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE;
10456 		}
10457 		if (defaulted_group) {
10458 			*defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID;
10459 		}
10460 		if (defaulted_owner) {
10461 			*defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID;
10462 		}
10463 	}
10464 	return error;
10465 }
10466 
10467 /*
10468  * Check that the attribute information in vap can be legally written by the
10469  * context.
10470  *
10471  * Call this when you're not sure about the vnode_attr; either its contents
10472  * have come from an unknown source, or when they are variable.
10473  *
10474  * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
10475  * must be authorized to be permitted to write the vattr.
10476  */
10477 int
vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
10479 {
10480 	struct vnode_attr ova;
10481 	kauth_action_t  required_action;
10482 	int             error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
10483 	guid_t          changer;
10484 	gid_t           group;
10485 	uid_t           owner;
10486 	mode_t          newmode;
10487 	kauth_cred_t    cred;
10488 	uint32_t        fdelta;
10489 
10490 	VATTR_INIT(&ova);
10491 	required_action = 0;
10492 	error = 0;
10493 
10494 	/*
10495 	 * Quickly check for enforcement applicability.
10496 	 */
10497 	if (vfs_authopaque(vp->v_mount)) {
10498 		goto out;
10499 	}
10500 
10501 	/*
10502 	 * Check for attempts to set nonsensical fields.
10503 	 */
10504 	if (vap->va_active & VNODE_ATTR_RDONLY) {
10505 		KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
10506 		error = EINVAL;
10507 		goto out;
10508 	}
10509 
10510 	/*
10511 	 * We need to know if the caller is the superuser.
10512 	 */
10513 	cred = vfs_context_ucred(ctx);
10514 	has_priv_suser = kauth_cred_issuser(cred);
10515 
10516 	/*
10517 	 * If any of the following are changing, we need information from the old file:
10518 	 * va_uid
10519 	 * va_gid
10520 	 * va_mode
10521 	 * va_uuuid
10522 	 * va_guuid
10523 	 */
10524 	if (VATTR_IS_ACTIVE(vap, va_uid) ||
10525 	    VATTR_IS_ACTIVE(vap, va_gid) ||
10526 	    VATTR_IS_ACTIVE(vap, va_mode) ||
10527 	    VATTR_IS_ACTIVE(vap, va_uuuid) ||
10528 	    VATTR_IS_ACTIVE(vap, va_guuid)) {
10529 		VATTR_WANTED(&ova, va_mode);
10530 		VATTR_WANTED(&ova, va_uid);
10531 		VATTR_WANTED(&ova, va_gid);
10532 		VATTR_WANTED(&ova, va_uuuid);
10533 		VATTR_WANTED(&ova, va_guuid);
10534 		KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
10535 	}
10536 
10537 	/*
10538 	 * If timestamps are being changed, we need to know who the file is owned
10539 	 * by.
10540 	 */
10541 	if (VATTR_IS_ACTIVE(vap, va_create_time) ||
10542 	    VATTR_IS_ACTIVE(vap, va_change_time) ||
10543 	    VATTR_IS_ACTIVE(vap, va_modify_time) ||
10544 	    VATTR_IS_ACTIVE(vap, va_access_time) ||
10545 	    VATTR_IS_ACTIVE(vap, va_backup_time) ||
10546 	    VATTR_IS_ACTIVE(vap, va_addedtime)) {
10547 		VATTR_WANTED(&ova, va_uid);
10548 #if 0   /* enable this when we support UUIDs as official owners */
10549 		VATTR_WANTED(&ova, va_uuuid);
10550 #endif
10551 		KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
10552 	}
10553 
10554 	/*
10555 	 * If flags are being changed, we need the old flags.
10556 	 */
10557 	if (VATTR_IS_ACTIVE(vap, va_flags)) {
10558 		KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
10559 		VATTR_WANTED(&ova, va_flags);
10560 	}
10561 
10562 	/*
10563 	 * If ACLs are being changed, we need the old ACLs.
10564 	 */
10565 	if (VATTR_IS_ACTIVE(vap, va_acl)) {
10566 		KAUTH_DEBUG("ATTR - acl changing, fetching old flags");
10567 		VATTR_WANTED(&ova, va_acl);
10568 	}
10569 
10570 	/*
10571 	 * If the size is being set, make sure it's not a directory.
10572 	 */
10573 	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
10574 		/* size is only meaningful on regular files, don't permit otherwise */
10575 		if (!vnode_isreg(vp)) {
10576 			KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file");
10577 			error = vnode_isdir(vp) ? EISDIR : EINVAL;
10578 			goto out;
10579 		}
10580 	}
10581 
10582 	/*
10583 	 * Get old data.
10584 	 */
10585 	KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
10586 	if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
10587 		KAUTH_DEBUG("  ERROR - got %d trying to get attributes", error);
10588 		goto out;
10589 	}
10590 
10591 	/*
10592 	 * Size changes require write access to the file data.
10593 	 */
10594 	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
10595 		/* if we can't get the size, or it's different, we need write access */
10596 		KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
10597 		required_action |= KAUTH_VNODE_WRITE_DATA;
10598 	}
10599 
10600 	/*
10601 	 * Changing timestamps?
10602 	 *
10603 	 * Note that we are only called to authorize user-requested time changes;
10604 	 * side-effect time changes are not authorized.  Authorisation is only
10605 	 * required for existing files.
10606 	 *
10607 	 * Non-owners are not permitted to change the time on an existing
10608 	 * file to anything other than the current time.
10609 	 */
10610 	if (VATTR_IS_ACTIVE(vap, va_create_time) ||
10611 	    VATTR_IS_ACTIVE(vap, va_change_time) ||
10612 	    VATTR_IS_ACTIVE(vap, va_modify_time) ||
10613 	    VATTR_IS_ACTIVE(vap, va_access_time) ||
10614 	    VATTR_IS_ACTIVE(vap, va_backup_time) ||
10615 	    VATTR_IS_ACTIVE(vap, va_addedtime)) {
10616 		/*
10617 		 * The owner and root may set any timestamps they like,
10618 		 * provided that the file is not immutable.  The owner still needs
10619 		 * WRITE_ATTRIBUTES (implied by ownership but still deniable).
10620 		 */
10621 		if (has_priv_suser || vauth_node_owner(&ova, cred)) {
10622 			KAUTH_DEBUG("ATTR - root or owner changing timestamps");
10623 			required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
10624 		} else {
10625 			/* just setting the current time? */
10626 			if (vap->va_vaflags & VA_UTIMES_NULL) {
10627 				KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
10628 				required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
10629 			} else {
10630 				KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
10631 				error = EACCES;
10632 				goto out;
10633 			}
10634 		}
10635 	}
10636 
10637 	/*
10638 	 * Changing file mode?
10639 	 */
10640 	if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
10641 		KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
10642 
10643 		/*
10644 		 * Mode changes always have the same basic auth requirements.
10645 		 */
10646 		if (has_priv_suser) {
10647 			KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
10648 			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
10649 		} else {
10650 			/* need WRITE_SECURITY */
10651 			KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
10652 			required_action |= KAUTH_VNODE_WRITE_SECURITY;
10653 		}
10654 
10655 		/*
10656 		 * Can't set the setgid bit if you're not in the group and not root.  Have to have
10657 		 * existing group information in the case we're not setting it right now.
10658 		 */
10659 		if (vap->va_mode & S_ISGID) {
10660 			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
10661 			if (!has_priv_suser) {
10662 				if (VATTR_IS_ACTIVE(vap, va_gid)) {
10663 					group = vap->va_gid;
10664 				} else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
10665 					group = ova.va_gid;
10666 				} else {
10667 					KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
10668 					error = EINVAL;
10669 					goto out;
10670 				}
10671 				/*
10672 				 * This might be too restrictive; WRITE_SECURITY might be implied by
10673 				 * membership in this case, rather than being an additional requirement.
10674 				 */
10675 				if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
10676 					KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
10677 					goto out;
10678 				}
10679 				if (!ismember) {
10680 					KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", group);
10681 					error = EPERM;
10682 					goto out;
10683 				}
10684 			}
10685 		}
10686 
10687 		/*
10688 		 * Can't set the setuid bit unless you're root or the file's owner.
10689 		 */
10690 		if (vap->va_mode & S_ISUID) {
10691 			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
10692 			if (!has_priv_suser) {
10693 				if (VATTR_IS_ACTIVE(vap, va_uid)) {
10694 					owner = vap->va_uid;
10695 				} else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
10696 					owner = ova.va_uid;
10697 				} else {
10698 					KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
10699 					error = EINVAL;
10700 					goto out;
10701 				}
10702 				if (owner != kauth_cred_getuid(cred)) {
10703 					/*
10704 					 * We could allow this if WRITE_SECURITY is permitted, perhaps.
10705 					 */
10706 					KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
10707 					error = EPERM;
10708 					goto out;
10709 				}
10710 			}
10711 		}
10712 	}
10713 
10714 	/*
10715 	 * Validate/mask flags changes.  This checks that only the flags in
10716 	 * the UF_SETTABLE mask are being set, and preserves the flags in
10717 	 * the SF_SETTABLE case.
10718 	 *
10719 	 * Since flags changes may be made in conjunction with other changes,
10720 	 * we will ask the auth code to ignore immutability in the case that
10721 	 * the SF_* flags are not set and we are only manipulating the file flags.
10722 	 *
10723 	 */
10724 	if (VATTR_IS_ACTIVE(vap, va_flags)) {
10725 		/* compute changing flags bits */
10726 		vap->va_flags &= ~SF_SYNTHETIC;
10727 		ova.va_flags &= ~SF_SYNTHETIC;
10728 		if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
10729 			fdelta = vap->va_flags ^ ova.va_flags;
10730 		} else {
10731 			fdelta = vap->va_flags;
10732 		}
10733 
10734 		if (fdelta != 0) {
10735 			KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
10736 			required_action |= KAUTH_VNODE_WRITE_SECURITY;
10737 
10738 			/* check that changing bits are legal */
10739 			if (has_priv_suser) {
10740 				/*
10741 				 * The immutability check will prevent us from clearing the SF_*
10742 				 * flags unless the system securelevel permits it, so just check
10743 				 * for legal flags here.
10744 				 */
10745 				if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
10746 					error = EPERM;
10747 					KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
10748 					goto out;
10749 				}
10750 			} else {
10751 				if (fdelta & ~UF_SETTABLE) {
10752 					error = EPERM;
10753 					KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
10754 					goto out;
10755 				}
10756 			}
10757 			/*
10758 			 * If the caller has the ability to manipulate file flags,
10759 			 * security is not reduced by ignoring them for this operation.
10760 			 *
10761 			 * A more complete test here would consider the 'after' states of the flags
10762 			 * to determine whether it would permit the operation, but this becomes
10763 			 * very complex.
10764 			 *
10765 			 * Ignoring immutability is conditional on securelevel; this does not bypass
10766 			 * the SF_* flags if securelevel > 0.
10767 			 */
10768 			required_action |= KAUTH_VNODE_NOIMMUTABLE;
10769 		}
10770 	}
10771 
10772 	/*
10773 	 * Validate ownership information.
10774 	 */
10775 	chowner = 0;
10776 	chgroup = 0;
10777 	clear_suid = 0;
10778 	clear_sgid = 0;
10779 
10780 	/*
10781 	 * uid changing
10782 	 * Note that if the filesystem didn't give us a UID, we expect that it doesn't
10783 	 * support them in general, and will ignore it if/when we try to set it.
10784 	 * We might want to clear the uid out of vap completely here.
10785 	 */
10786 	if (VATTR_IS_ACTIVE(vap, va_uid)) {
10787 		if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
10788 			if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
10789 				KAUTH_DEBUG("  DENIED - non-superuser cannot change ownershipt to a third party");
10790 				error = EPERM;
10791 				goto out;
10792 			}
10793 			chowner = 1;
10794 		}
10795 		clear_suid = 1;
10796 	}
10797 
10798 	/*
10799 	 * gid changing
10800 	 * Note that if the filesystem didn't give us a GID, we expect that it doesn't
10801 	 * support them in general, and will ignore it if/when we try to set it.
10802 	 * We might want to clear the gid out of vap completely here.
10803 	 */
10804 	if (VATTR_IS_ACTIVE(vap, va_gid)) {
10805 		if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
10806 			if (!has_priv_suser) {
10807 				if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
10808 					KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
10809 					goto out;
10810 				}
10811 				if (!ismember) {
10812 					KAUTH_DEBUG("  DENIED - group change from %d to %d but not a member of target group",
10813 					    ova.va_gid, vap->va_gid);
10814 					error = EPERM;
10815 					goto out;
10816 				}
10817 			}
10818 			chgroup = 1;
10819 		}
10820 		clear_sgid = 1;
10821 	}
10822 
10823 	/*
10824 	 * Owner UUID being set or changed.
10825 	 */
10826 	if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
10827 		/* if the owner UUID is not actually changing ... */
10828 		if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) {
10829 			if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid)) {
10830 				goto no_uuuid_change;
10831 			}
10832 
10833 			/*
10834 			 * If the current owner UUID is a null GUID, check
10835 			 * it against the UUID corresponding to the owner UID.
10836 			 */
10837 			if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) &&
10838 			    VATTR_IS_SUPPORTED(&ova, va_uid)) {
10839 				guid_t uid_guid;
10840 
10841 				if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
10842 				    kauth_guid_equal(&vap->va_uuuid, &uid_guid)) {
10843 					goto no_uuuid_change;
10844 				}
10845 			}
10846 		}
10847 
10848 		/*
10849 		 * The owner UUID cannot be set by a non-superuser to anything other than
10850 		 * their own or a null GUID (to "unset" the owner UUID).
10851 		 * Note that file systems must be prepared to handle the
10852 		 * null UUID case in a manner appropriate for that file
10853 		 * system.
10854 		 */
10855 		if (!has_priv_suser) {
10856 			if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
10857 				KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
10858 				/* XXX ENOENT here - no UUID - should perhaps become EPERM */
10859 				goto out;
10860 			}
10861 			if (!kauth_guid_equal(&vap->va_uuuid, &changer) &&
10862 			    !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) {
10863 				KAUTH_DEBUG("  ERROR - cannot set supplied owner UUID - not us / null");
10864 				error = EPERM;
10865 				goto out;
10866 			}
10867 		}
10868 		chowner = 1;
10869 		clear_suid = 1;
10870 	}
10871 no_uuuid_change:
10872 	/*
10873 	 * Group UUID being set or changed.
10874 	 */
10875 	if (VATTR_IS_ACTIVE(vap, va_guuid)) {
10876 		/* if the group UUID is not actually changing ... */
10877 		if (VATTR_IS_SUPPORTED(&ova, va_guuid)) {
10878 			if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid)) {
10879 				goto no_guuid_change;
10880 			}
10881 
10882 			/*
10883 			 * If the current group UUID is a null UUID, check
10884 			 * it against the UUID corresponding to the group GID.
10885 			 */
10886 			if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) &&
10887 			    VATTR_IS_SUPPORTED(&ova, va_gid)) {
10888 				guid_t gid_guid;
10889 
10890 				if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
10891 				    kauth_guid_equal(&vap->va_guuid, &gid_guid)) {
10892 					goto no_guuid_change;
10893 				}
10894 			}
10895 		}
10896 
10897 		/*
10898 		 * The group UUID cannot be set by a non-superuser to anything other than
10899 		 * one of which they are a member or a null GUID (to "unset"
10900 		 * the group UUID).
10901 		 * Note that file systems must be prepared to handle the
10902 		 * null UUID case in a manner appropriate for that file
10903 		 * system.
10904 		 */
10905 		if (!has_priv_suser) {
10906 			if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) {
10907 				ismember = 1;
10908 			} else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
10909 				KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
10910 				goto out;
10911 			}
10912 			if (!ismember) {
10913 				KAUTH_DEBUG("  ERROR - cannot set supplied group UUID - not a member / null");
10914 				error = EPERM;
10915 				goto out;
10916 			}
10917 		}
10918 		chgroup = 1;
10919 	}
10920 no_guuid_change:
10921 
10922 	/*
10923 	 * Compute authorisation for group/ownership changes.
10924 	 */
10925 	if (chowner || chgroup || clear_suid || clear_sgid) {
10926 		if (has_priv_suser) {
10927 			KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
10928 			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
10929 		} else {
10930 			if (chowner) {
10931 				KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
10932 				required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
10933 			}
10934 			if (chgroup && !chowner) {
10935 				KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
10936 				required_action |= KAUTH_VNODE_WRITE_SECURITY;
10937 			}
10938 		}
10939 
10940 		/*
10941 		 * clear set-uid and set-gid bits. POSIX only requires this for
10942 		 * non-privileged processes but we do it even for root.
10943 		 */
10944 		if (VATTR_IS_ACTIVE(vap, va_mode)) {
10945 			newmode = vap->va_mode;
10946 		} else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
10947 			newmode = ova.va_mode;
10948 		} else {
10949 			KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
10950 			newmode = 0;
10951 		}
10952 
10953 		/* chown always clears setuid/gid bits. An exception is made for
10954 		 * setattrlist which can set both at the same time: <uid, gid, mode> on a file:
10955 		 * setattrlist is allowed to set the new mode on the file and change (chown)
10956 		 * uid/gid.
10957 		 */
10958 		if (newmode & (S_ISUID | S_ISGID)) {
10959 			if (!VATTR_IS_ACTIVE(vap, va_mode)) {
10960 				KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
10961 				    newmode, newmode & ~(S_ISUID | S_ISGID));
10962 				newmode &= ~(S_ISUID | S_ISGID);
10963 			}
10964 			VATTR_SET(vap, va_mode, newmode);
10965 		}
10966 	}
10967 
10968 	/*
10969 	 * Authorise changes in the ACL.
10970 	 */
10971 	if (VATTR_IS_ACTIVE(vap, va_acl)) {
10972 		/* no existing ACL */
10973 		if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
10974 			/* adding an ACL */
10975 			if (vap->va_acl != NULL) {
10976 				required_action |= KAUTH_VNODE_WRITE_SECURITY;
10977 				KAUTH_DEBUG("CHMOD - adding ACL");
10978 			}
10979 
10980 			/* removing an existing ACL */
10981 		} else if (vap->va_acl == NULL) {
10982 			required_action |= KAUTH_VNODE_WRITE_SECURITY;
10983 			KAUTH_DEBUG("CHMOD - removing ACL");
10984 
10985 			/* updating an existing ACL */
10986 		} else {
10987 			if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
10988 				/* entry count changed, must be different */
10989 				required_action |= KAUTH_VNODE_WRITE_SECURITY;
10990 				KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
10991 			} else if (vap->va_acl->acl_entrycount > 0) {
10992 				/* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
10993 				if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
10994 				    sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
10995 					required_action |= KAUTH_VNODE_WRITE_SECURITY;
10996 					KAUTH_DEBUG("CHMOD - changing ACL entries");
10997 				}
10998 			}
10999 		}
11000 	}
11001 
11002 	/*
11003 	 * Other attributes that require authorisation.
11004 	 */
11005 	if (VATTR_IS_ACTIVE(vap, va_encoding)) {
11006 		required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
11007 	}
11008 
11009 out:
11010 	if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) {
11011 		kauth_acl_free(ova.va_acl);
11012 	}
11013 	if (error == 0) {
11014 		*actionp = required_action;
11015 	}
11016 	return error;
11017 }
11018 
11019 static int
setlocklocal_callback(struct vnode * vp,__unused void * cargs)11020 setlocklocal_callback(struct vnode *vp, __unused void *cargs)
11021 {
11022 	vnode_lock_spin(vp);
11023 	vp->v_flag |= VLOCKLOCAL;
11024 	vnode_unlock(vp);
11025 
11026 	return VNODE_RETURNED;
11027 }
11028 
11029 void
vfs_setlocklocal(mount_t mp)11030 vfs_setlocklocal(mount_t mp)
11031 {
11032 	mount_lock_spin(mp);
11033 	mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
11034 	mount_unlock(mp);
11035 
11036 	/*
11037 	 * The number of active vnodes is expected to be
11038 	 * very small when vfs_setlocklocal is invoked.
11039 	 */
11040 	vnode_iterate(mp, 0, setlocklocal_callback, NULL);
11041 }
11042 
11043 void
vfs_setcompoundopen(mount_t mp)11044 vfs_setcompoundopen(mount_t mp)
11045 {
11046 	mount_lock_spin(mp);
11047 	mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN;
11048 	mount_unlock(mp);
11049 }
11050 
11051 void
vnode_setswapmount(vnode_t vp)11052 vnode_setswapmount(vnode_t vp)
11053 {
11054 	mount_lock(vp->v_mount);
11055 	vp->v_mount->mnt_kern_flag |= MNTK_SWAP_MOUNT;
11056 	mount_unlock(vp->v_mount);
11057 }
11058 
11059 void
vfs_setfskit(mount_t mp)11060 vfs_setfskit(mount_t mp)
11061 {
11062 	mount_lock_spin(mp);
11063 	mp->mnt_kern_flag |= MNTK_FSKIT;
11064 	mount_unlock(mp);
11065 }
11066 
11067 char *
vfs_getfstypenameref_locked(mount_t mp,size_t * lenp)11068 vfs_getfstypenameref_locked(mount_t mp, size_t *lenp)
11069 {
11070 	char *name;
11071 
11072 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11073 		name = mp->fstypename_override;
11074 	} else {
11075 		name = mp->mnt_vfsstat.f_fstypename;
11076 	}
11077 	if (lenp != NULL) {
11078 		*lenp = strlen(name);
11079 	}
11080 	return name;
11081 }
11082 
11083 void
vfs_getfstypename(mount_t mp,char * buf,size_t buflen)11084 vfs_getfstypename(mount_t mp, char *buf, size_t buflen)
11085 {
11086 	mount_lock_spin(mp);
11087 	strlcpy(buf, vfs_getfstypenameref_locked(mp, NULL), buflen);
11088 	mount_unlock(mp);
11089 }
11090 
11091 void
vfs_setfstypename_locked(mount_t mp,const char * name)11092 vfs_setfstypename_locked(mount_t mp, const char *name)
11093 {
11094 	if (name == NULL || name[0] == '\0') {
11095 		mp->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
11096 		mp->fstypename_override[0] = '\0';
11097 	} else {
11098 		strlcpy(mp->fstypename_override, name,
11099 		    sizeof(mp->fstypename_override));
11100 		mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
11101 	}
11102 }
11103 
11104 void
vfs_setfstypename(mount_t mp,const char * name)11105 vfs_setfstypename(mount_t mp, const char *name)
11106 {
11107 	mount_lock_spin(mp);
11108 	vfs_setfstypename_locked(mp, name);
11109 	mount_unlock(mp);
11110 }
11111 
11112 int64_t
vnode_getswappin_avail(vnode_t vp)11113 vnode_getswappin_avail(vnode_t vp)
11114 {
11115 	int64_t max_swappin_avail = 0;
11116 
11117 	mount_lock(vp->v_mount);
11118 	if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED) {
11119 		max_swappin_avail = vp->v_mount->mnt_max_swappin_available;
11120 	}
11121 	mount_unlock(vp->v_mount);
11122 
11123 	return max_swappin_avail;
11124 }
11125 
11126 
11127 void
vn_setunionwait(vnode_t vp)11128 vn_setunionwait(vnode_t vp)
11129 {
11130 	vnode_lock_spin(vp);
11131 	vp->v_flag |= VISUNION;
11132 	vnode_unlock(vp);
11133 }
11134 
11135 
/*
 * Block the calling thread until the VISUNION flag is cleared on this
 * vnode (vn_clearunionwait() clears it and issues the wakeup).
 */
void
vn_checkunionwait(vnode_t vp)
{
	vnode_lock_spin(vp);
	while ((vp->v_flag & VISUNION) == VISUNION) {
		/* msleep() drops and re-acquires the vnode lock around the wait */
		msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
	}
	vnode_unlock(vp);
}
11145 
/*
 * Clear VISUNION and wake any threads parked in vn_checkunionwait().
 * 'locked' indicates whether the caller already holds the vnode lock.
 */
void
vn_clearunionwait(vnode_t vp, int locked)
{
	if (!locked) {
		vnode_lock_spin(vp);
	}
	if ((vp->v_flag & VISUNION) == VISUNION) {
		vp->v_flag &= ~VISUNION;
		wakeup((caddr_t)&vp->v_flag);
	}
	if (!locked) {
		vnode_unlock(vp);
	}
}
11160 
11161 /*
11162  * Removes orphaned apple double files during a rmdir
11163  * Works by:
11164  * 1. vnode_suspend().
11165  * 2. Call VNOP_READDIR() till the end of directory is reached.
11166  * 3. Check if the directory entries returned are regular files with name starting with "._".  If not, return ENOTEMPTY.
11167  * 4. Continue (2) and (3) till end of directory is reached.
11168  * 5. If all the entries in the directory were files with "._" name, delete all the files.
11169  * 6. vnode_resume()
11170  * 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
11171  */
11172 
errno_t
rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_flag)
{
#define UIO_BUFF_SIZE 2048
	uio_t auio = NULL;
	/* siz tracks the number of valid bytes returned by each READDIR */
	int eofflag, siz = UIO_BUFF_SIZE, alloc_size = 0, nentries = 0;
	int open_flag = 0, full_erase_flag = 0;
	UIO_STACKBUF(uio_buf, 1);
	char *rbuf = NULL;
	void *dir_pos;
	void *dir_end;
	struct dirent *dp;
	errno_t error;

	/* suspend the vnode so no new operations start while we scan/delete */
	error = vnode_suspend(vp);

	/*
	 * restart_flag is set so that the calling rmdir sleeps and resets
	 */
	if (error == EBUSY) {
		*restart_flag = 1;
	}
	if (error != 0) {
		return error;
	}

	/*
	 * Prevent dataless fault materialization while we have
	 * a suspended vnode.
	 */
	uthread_t ut = current_uthread();
	bool saved_nodatalessfaults =
	    (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? true : false;
	ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;

	/*
	 * set up UIO
	 */
	rbuf = kalloc_data(siz, Z_WAITOK);
	alloc_size = siz;
	if (rbuf) {
		auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
	}
	if (!rbuf || !auio) {
		error = ENOMEM;
		goto outsc;
	}

	uio_setoffset(auio, 0);

	eofflag = 0;

	if ((error = VNOP_OPEN(vp, FREAD, ctx))) {
		goto outsc;
	} else {
		open_flag = 1;
	}

	/*
	 * First pass checks if all files are appleDouble files.
	 */

	do {
		siz = UIO_BUFF_SIZE;
		uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);

		if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) {
			goto outsc;
		}

		/* compute how many bytes the filesystem actually filled in */
		if (uio_resid(auio) != 0) {
			siz -= uio_resid(auio);
		}

		/*
		 * Iterate through directory
		 */
		dir_pos = (void*) rbuf;
		dir_end = (void*) (rbuf + siz);
		dp = (struct dirent*) (dir_pos);

		if (dir_pos == dir_end) {
			eofflag = 1;
		}

		while (dir_pos < dir_end) {
			/*
			 * Check for . and .. as well as directories
			 */
			if (dp->d_ino != 0 &&
			    !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
			    (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) {
				/*
				 * Check for irregular files and ._ files
				 * If there is a ._._ file abort the op
				 */
				if (dp->d_namlen < 2 ||
				    strncmp(dp->d_name, "._", 2) ||
				    (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._", 2))) {
					error = ENOTEMPTY;
					goto outsc;
				}
			}
			/*
			 * NOTE(review): advancing by d_reclen assumes the FS never
			 * returns a zero record length (which would loop forever);
			 * confirm against the VNOP_READDIR contract.
			 */
			dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
			dp = (struct dirent*)dir_pos;
		}

		/*
		 * workaround for HFS/NFS setting eofflag before end of file
		 */
		if (vp->v_tag == VT_HFS && nentries > 2) {
			eofflag = 0;
		}

		if (vp->v_tag == VT_NFS) {
			if (eofflag && !full_erase_flag) {
				/* NFS: take one extra full pass from offset 0 after first EOF */
				full_erase_flag = 1;
				eofflag = 0;
				uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
			} else if (!eofflag && full_erase_flag) {
				full_erase_flag = 0;
			}
		}
	} while (!eofflag);
	/*
	 * If we've made it here all the files in the dir are ._ files.
	 * We can delete the files even though the node is suspended
	 * because we are the owner of the file.
	 */

	uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
	eofflag = 0;
	full_erase_flag = 0;

	do {
		siz = UIO_BUFF_SIZE;
		uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);

		error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);

		if (error != 0) {
			goto outsc;
		}

		if (uio_resid(auio) != 0) {
			siz -= uio_resid(auio);
		}

		/*
		 * Iterate through directory
		 */
		dir_pos = (void*) rbuf;
		dir_end = (void*) (rbuf + siz);
		dp = (struct dirent*) dir_pos;

		if (dir_pos == dir_end) {
			eofflag = 1;
		}

		while (dir_pos < dir_end) {
			/*
			 * Check for . and .. as well as directories
			 */
			if (dp->d_ino != 0 &&
			    !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
			    (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
			    ) {
				/* second pass: delete each ._ entry (ENOENT tolerated) */
				error = unlink1(ctx, vp,
				    CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE,
				    VNODE_REMOVE_SKIP_NAMESPACE_EVENT |
				    VNODE_REMOVE_NO_AUDIT_PATH);

				if (error && error != ENOENT) {
					goto outsc;
				}
			}
			dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
			dp = (struct dirent*)dir_pos;
		}

		/*
		 * workaround for HFS/NFS setting eofflag before end of file
		 */
		if (vp->v_tag == VT_HFS && nentries > 2) {
			eofflag = 0;
		}

		if (vp->v_tag == VT_NFS) {
			if (eofflag && !full_erase_flag) {
				full_erase_flag = 1;
				eofflag = 0;
				uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
			} else if (!eofflag && full_erase_flag) {
				full_erase_flag = 0;
			}
		}
	} while (!eofflag);


	error = 0;

outsc:
	/* cleanup in reverse order of acquisition */
	if (open_flag) {
		VNOP_CLOSE(vp, FREAD, ctx);
	}

	if (auio) {
		uio_free(auio);
	}
	kfree_data(rbuf, alloc_size);

	/* restore the caller's dataless-faults setting only if we changed it */
	if (saved_nodatalessfaults == false) {
		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
	}

	vnode_resume(vp);

	return error;
}
11395 
11396 
/*
 * Post a kevent against the vnode's knote list, taking the vnode lock
 * only when the list is non-empty.
 */
void
lock_vnode_and_post(vnode_t vp, int kevent_num)
{
	/*
	 * Only take the lock if there's something there!
	 * NOTE(review): the unlocked peek at slh_first is racy — a knote
	 * attached concurrently may miss this event; presumably acceptable
	 * to callers, confirm before relying on delivery.
	 */
	if (vp->v_knotes.slh_first != NULL) {
		vnode_lock(vp);
		KNOTE(&vp->v_knotes, kevent_num);
		vnode_unlock(vp);
	}
}
11407 
11408 void panic_print_vnodes(void);
11409 
11410 /* define PANIC_PRINTS_VNODES only if investigation is required. */
11411 #ifdef PANIC_PRINTS_VNODES
11412 
11413 static const char *
__vtype(uint16_t vtype)11414 __vtype(uint16_t vtype)
11415 {
11416 	switch (vtype) {
11417 	case VREG:
11418 		return "R";
11419 	case VDIR:
11420 		return "D";
11421 	case VBLK:
11422 		return "B";
11423 	case VCHR:
11424 		return "C";
11425 	case VLNK:
11426 		return "L";
11427 	case VSOCK:
11428 		return "S";
11429 	case VFIFO:
11430 		return "F";
11431 	case VBAD:
11432 		return "x";
11433 	case VSTR:
11434 		return "T";
11435 	case VCPLX:
11436 		return "X";
11437 	default:
11438 		return "?";
11439 	}
11440 }
11441 
11442 /*
11443  * build a path from the bottom up
11444  * NOTE: called from the panic path - no alloc'ing of memory and no locks!
11445  */
static char *
__vpath(vnode_t vp, char *str, int len, int depth)
{
	int vnm_len;
	const char *src;
	char *dst;

	/* out of space: return the start of the buffer */
	if (len <= 0) {
		return str;
	}
	/* str + len is the start of the string we created */
	if (!vp->v_name) {
		return str + len;
	}

	/* follow mount vnodes to get the full path */
	if ((vp->v_flag & VROOT)) {
		if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) {
			/* hop to the vnode this mount covers and keep building */
			return __vpath(vp->v_mount->mnt_vnodecovered,
			           str, len, depth + 1);
		}
		return str + len;
	}

	src = vp->v_name;
	vnm_len = strlen(src);
	if (vnm_len > len) {
		/* truncate the name to fit in the string */
		src += (vnm_len - len);
		vnm_len = len;
	}

	/* start from the back and copy just characters (no NULLs) */

	/* this will chop off leaf path (file) names */
	if (depth > 0) {
		dst = str + len - vnm_len;
		memcpy(dst, src, vnm_len);
		len -= vnm_len;
	} else {
		/* depth == 0 (the leaf itself): skip its own name */
		dst = str + len;
	}

	if (vp->v_parent && len > 1) {
		/* follow parents up the chain, separating components with '/' */
		len--;
		*(dst - 1) = '/';
		return __vpath(vp->v_parent, str, len, depth + 1);
	}

	/* returns a pointer to the first character of the assembled path */
	return dst;
}
11498 
#define SANE_VNODE_PRINT_LIMIT 5000
/*
 * Dump type, usecount, iocount and best-effort path of every vnode on
 * every mount into the panic log.  Runs on the panic path: pointers are
 * validated with ml_validate_nofault() before being dereferenced, and
 * output is capped at SANE_VNODE_PRINT_LIMIT entries.
 */
void
panic_print_vnodes(void)
{
	mount_t mnt;
	vnode_t vp;
	int nvnodes = 0;
	const char *type;
	char *nm;
	char vname[257];

	paniclog_append_noflush("\n***** VNODES *****\n"
	    "TYPE UREF ICNT PATH\n");

	/* NULL-terminate the path name */
	vname[sizeof(vname) - 1] = '\0';

	/*
	 * iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist
	 */
	TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
		/*
		 * NOTE(review): sizeof(mount_t) is the size of a pointer,
		 * not of struct mount — this validates only the first few
		 * bytes of the structure; confirm that is the intent.
		 */
		if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) {
			paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n",
			    &mountlist, mnt);
			break;
		}

		TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) {
			if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) {
				paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n",
				    &mnt->mnt_vnodelist, vp);
				break;
			}

			/* cap output so a huge vnode table cannot stall the panic path */
			if (++nvnodes > SANE_VNODE_PRINT_LIMIT) {
				return;
			}
			type = __vtype(vp->v_type);
			nm = __vpath(vp, vname, sizeof(vname) - 1, 0);
			paniclog_append_noflush("%s %0d %0d %s\n",
			    type, vp->v_usecount, vp->v_iocount, nm);
		}
	}
}
11543 
11544 #else /* !PANIC_PRINTS_VNODES */
/* Stub: vnode dumping is compiled out unless PANIC_PRINTS_VNODES is defined. */
void
panic_print_vnodes(void)
{
	/* nothing to do */
}
11550 #endif
11551 
11552 
11553 #ifdef CONFIG_IOCOUNT_TRACE
/*
 * Record the current backtrace against a vnode for iocount tracing;
 * 'type' selects the bucket (get vs. put).  Backtraces are deduplicated:
 * a repeat of an already-recorded stack only bumps its count.  Recording
 * stops silently once IOCOUNT_TRACE_MAX_IDX unique stacks are captured.
 */
static void
record_iocount_trace_vnode(vnode_t vp, int type)
{
	void *stacks[IOCOUNT_TRACE_MAX_FRAMES] = {0};
	/*
	 * NOTE(review): idx is read and later incremented without a lock;
	 * presumably serialized by the caller or tolerated as best-effort
	 * debug data — confirm.
	 */
	int idx = vp->v_iocount_trace[type].idx;

	if (idx >= IOCOUNT_TRACE_MAX_IDX) {
		return;
	}

	OSBacktrace((void **)&stacks[0], IOCOUNT_TRACE_MAX_FRAMES);

	/*
	 * To save index space, only store the unique backtraces. If dup is found,
	 * just bump the count and return.
	 */
	for (int i = 0; i < idx; i++) {
		if (memcmp(&stacks[0], &vp->v_iocount_trace[type].stacks[i][0],
		    sizeof(stacks)) == 0) {
			vp->v_iocount_trace[type].counts[i]++;
			return;
		}
	}

	/* new unique stack: store it and start its count at one */
	memcpy(&vp->v_iocount_trace[type].stacks[idx][0], &stacks[0],
	    sizeof(stacks));
	vp->v_iocount_trace[type].counts[idx] = 1;
	vp->v_iocount_trace[type].idx++;
}
11583 
/*
 * Track per-thread iocount deltas.  On an iocount grab (count == 1),
 * remember the vnode and a 10-frame backtrace in one of the uthread's
 * 32 trace slots so leaked iocounts can be attributed to the thread
 * that took them.
 */
static void
record_iocount_trace_uthread(vnode_t vp, int count)
{
	struct uthread *ut;

	ut = current_uthread();
	ut->uu_iocount += count;

	if (count == 1) {
		if (ut->uu_vpindex < 32) {
			OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10);

			ut->uu_vps[ut->uu_vpindex] = vp;
			ut->uu_vpindex++;
		}
	}
}
11601 
/*
 * Entry point for iocount tracing, called on iocount get/put.
 * count > 0 is recorded as a "get", otherwise a "put".  Fast-exits
 * unless one of the trace boot-args is set; trigger (v_resolve) and
 * VSYSTEM vnodes are exempt.
 */
static void
record_vp(vnode_t vp, int count)
{
	/* common case: both trace boot-args are off */
	if (__probable(bootarg_vnode_iocount_trace == 0 &&
	    bootarg_uthread_iocount_trace == 0)) {
		return;
	}

#if CONFIG_TRIGGERS
	if (vp->v_resolve) {
		return;
	}
#endif
	if ((vp->v_flag & VSYSTEM)) {
		return;
	}

	if (bootarg_vnode_iocount_trace) {
		record_iocount_trace_vnode(vp,
		    (count > 0) ? IOCOUNT_TRACE_VGET : IOCOUNT_TRACE_VPUT);
	}
	if (bootarg_uthread_iocount_trace) {
		record_iocount_trace_uthread(vp, count);
	}
}
11627 #endif /* CONFIG_IOCOUNT_TRACE */
11628 
11629 #if CONFIG_TRIGGERS
11630 #define __triggers_unused
11631 #else
11632 #define __triggers_unused       __unused
11633 #endif
11634 
/*
 * Pack a resolver status, sequence number and auxiliary value into a
 * single resolver_result_t.  Unpacked by vfs_resolver_status(),
 * vfs_resolver_sequence() and vfs_resolver_auxiliary().
 */
resolver_result_t
vfs_resolver_result(__triggers_unused uint32_t seq, __triggers_unused enum resolver_status stat, __triggers_unused int aux)
{
#if CONFIG_TRIGGERS
	/*
	 * |<---   32   --->|<---  28  --->|<- 4 ->|
	 *      sequence        auxiliary    status
	 */
	return (((uint64_t)seq) << 32) |
	       (((uint64_t)(aux & 0x0fffffff)) << 4) |
	       (uint64_t)(stat & 0x0000000F);
#else
	/* triggers compiled out: sequence 0, aux ENOTSUP, status RESOLVER_ERROR */
	return (0x0ULL) | (((uint64_t)ENOTSUP) << 4) | (((uint64_t)RESOLVER_ERROR) & 0xF);
#endif
}
11650 
11651 #if CONFIG_TRIGGERS
11652 
11653 #define TRIG_DEBUG 0
11654 
11655 #if TRIG_DEBUG
11656 #define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0)
11657 #else
11658 #define TRIG_LOG(...)
11659 #endif
11660 
11661 /*
11662  * Resolver result functions
11663  */
11664 
11665 
11666 enum resolver_status
vfs_resolver_status(resolver_result_t result)11667 vfs_resolver_status(resolver_result_t result)
11668 {
11669 	/* lower 4 bits is status */
11670 	return result & 0x0000000F;
11671 }
11672 
11673 uint32_t
vfs_resolver_sequence(resolver_result_t result)11674 vfs_resolver_sequence(resolver_result_t result)
11675 {
11676 	/* upper 32 bits is sequence */
11677 	return (uint32_t)(result >> 32);
11678 }
11679 
11680 int
vfs_resolver_auxiliary(resolver_result_t result)11681 vfs_resolver_auxiliary(resolver_result_t result)
11682 {
11683 	/* 28 bits of auxiliary */
11684 	return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4);
11685 }
11686 
11687 /*
11688  * SPI
11689  * Call in for resolvers to update vnode trigger state
11690  */
int
vnode_trigger_update(vnode_t vp, resolver_result_t result)
{
	vnode_resolve_t rp;
	uint32_t seq;
	enum resolver_status stat;

	if (vp->v_resolve == NULL) {
		return EINVAL;
	}

	stat = vfs_resolver_status(result);
	seq = vfs_resolver_sequence(result);

	/* only RESOLVED / UNRESOLVED transitions may be reported here */
	if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) {
		return EINVAL;
	}

	rp = vp->v_resolve;
	lck_mtx_lock(&rp->vr_lock);

	/* stale (out-of-order) results are ignored via the sequence check */
	if (seq > rp->vr_lastseq) {
		if (stat == RESOLVER_RESOLVED) {
			rp->vr_flags |= VNT_RESOLVED;
		} else {
			rp->vr_flags &= ~VNT_RESOLVED;
		}

		rp->vr_lastseq = seq;
	}

	lck_mtx_unlock(&rp->vr_lock);

	return 0;
}
11726 
11727 static int
vnode_resolver_attach(vnode_t vp,vnode_resolve_t rp,boolean_t ref)11728 vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref)
11729 {
11730 	int error;
11731 
11732 	vnode_lock_spin(vp);
11733 	if (vp->v_resolve != NULL) {
11734 		vnode_unlock(vp);
11735 		return EINVAL;
11736 	} else {
11737 		vp->v_resolve = rp;
11738 	}
11739 	vnode_unlock(vp);
11740 
11741 	if (ref) {
11742 		error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE);
11743 		if (error != 0) {
11744 			panic("VNODE_REF_FORCE didn't help...");
11745 		}
11746 	}
11747 
11748 	return 0;
11749 }
11750 
11751 /*
11752  * VFS internal interfaces for vnode triggers
11753  *
11754  * vnode must already have an io count on entry
11755  * v_resolve is stable when io count is non-zero
11756  */
static int
vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external)
{
	vnode_resolve_t rp;
	int result;
	char byte;

#if 1
	/* minimum pointer test (debugging) */
	if (tinfo->vnt_data) {
		byte = *((char *)tinfo->vnt_data);
	}
#endif
	/* Z_NOFAIL: allocation cannot return NULL */
	rp = kalloc_type(struct vnode_resolve, Z_WAITOK | Z_NOFAIL);

	lck_mtx_init(&rp->vr_lock, &trigger_vnode_lck_grp, &trigger_vnode_lck_attr);

	/* copy the caller-supplied callbacks and private data */
	rp->vr_resolve_func = tinfo->vnt_resolve_func;
	rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
	rp->vr_rearm_func = tinfo->vnt_rearm_func;
	rp->vr_reclaim_func = tinfo->vnt_reclaim_func;
	rp->vr_data = tinfo->vnt_data;
	rp->vr_lastseq = 0;
	rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK;
	if (external) {
		/* external resolvers also take a usecount ref in attach below */
		rp->vr_flags |= VNT_EXTERNAL;
	}

	result = vnode_resolver_attach(vp, rp, external);
	if (result != 0) {
		goto out;
	}

	if (mp) {
		OSAddAtomic(1, &mp->mnt_numtriggers);
	}

	return result;

out:
	/* attach failed (resolver already present); free our allocation */
	kfree_type(struct vnode_resolve, rp);
	return result;
}
11800 
11801 static void
vnode_resolver_release(vnode_resolve_t rp)11802 vnode_resolver_release(vnode_resolve_t rp)
11803 {
11804 	/*
11805 	 * Give them a chance to free any private data
11806 	 */
11807 	if (rp->vr_data && rp->vr_reclaim_func) {
11808 		rp->vr_reclaim_func(NULLVP, rp->vr_data);
11809 	}
11810 
11811 	lck_mtx_destroy(&rp->vr_lock, &trigger_vnode_lck_grp);
11812 	kfree_type(struct vnode_resolve, rp);
11813 }
11814 
11815 /* Called after the vnode has been drained */
static void
vnode_resolver_detach(vnode_t vp)
{
	vnode_resolve_t rp;
	mount_t mp;

	mp = vnode_mount(vp);

	/* unhook the resolver under the vnode lock */
	vnode_lock(vp);
	rp = vp->v_resolve;
	vp->v_resolve = NULL;
	vnode_unlock(vp);

	/* drop the usecount reference taken for external resolvers at attach */
	if ((rp->vr_flags & VNT_EXTERNAL) != 0) {
		vnode_rele_ext(vp, O_EVTONLY, 1);
	}

	vnode_resolver_release(rp);

	/* Keep count of active trigger vnodes per mount */
	OSAddAtomic(-1, &mp->mnt_numtriggers);
}
11838 
/*
 * Give a resolved trigger vnode a chance to transition back to the
 * unresolved state via its rearm callback.  No-op unless a rearm
 * callback exists and VNT_AUTO_REARM is set.
 */
__private_extern__
void
vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx)
{
	vnode_resolve_t rp;
	resolver_result_t result;
	enum resolver_status status;
	uint32_t seq;

	if ((vp->v_resolve == NULL) ||
	    (vp->v_resolve->vr_rearm_func == NULL) ||
	    (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) {
		return;
	}

	rp = vp->v_resolve;
	lck_mtx_lock(&rp->vr_lock);

	/*
	 * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes.
	 */
	if (rp->vr_flags & VNT_VFS_UNMOUNTED) {
		lck_mtx_unlock(&rp->vr_lock);
		return;
	}

	/* Check if this vnode is already armed */
	if ((rp->vr_flags & VNT_RESOLVED) == 0) {
		lck_mtx_unlock(&rp->vr_lock);
		return;
	}

	lck_mtx_unlock(&rp->vr_lock);

	/* call out without the lock; staleness is handled by the sequence check */
	result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx);
	status = vfs_resolver_status(result);
	seq = vfs_resolver_sequence(result);

	lck_mtx_lock(&rp->vr_lock);
	if (seq > rp->vr_lastseq) {
		if (status == RESOLVER_UNRESOLVED) {
			rp->vr_flags &= ~VNT_RESOLVED;
		}
		rp->vr_lastseq = seq;
	}
	lck_mtx_unlock(&rp->vr_lock);
}
11886 
/*
 * Fire a trigger vnode's resolve callback during lookup, recording the
 * resulting state.  Returns 0 when nothing needs resolving or the
 * resolver succeeded; the resolver's auxiliary value on RESOLVER_ERROR.
 */
__private_extern__
int
vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
{
	vnode_resolve_t rp;
	enum path_operation op;
	resolver_result_t result;
	enum resolver_status status;
	uint32_t seq;

	/*
	 * N.B. we cannot call vfs_context_can_resolve_triggers()
	 * here because we really only want to suppress that in
	 * the event the trigger will be resolved by something in
	 * user-space.  Any triggers that are resolved by the kernel
	 * do not pose a threat of deadlock.
	 */

	/* Only trigger on topmost vnodes */
	if ((vp->v_resolve == NULL) ||
	    (vp->v_resolve->vr_resolve_func == NULL) ||
	    (vp->v_mountedhere != NULL)) {
		return 0;
	}

	rp = vp->v_resolve;
	lck_mtx_lock(&rp->vr_lock);

	/* Check if this vnode is already resolved */
	if (rp->vr_flags & VNT_RESOLVED) {
		lck_mtx_unlock(&rp->vr_lock);
		return 0;
	}

	lck_mtx_unlock(&rp->vr_lock);

#if CONFIG_MACF
	if ((rp->vr_flags & VNT_KERN_RESOLVE) == 0) {
		/*
		 * VNT_KERN_RESOLVE indicates this trigger has no parameters
		 * at the discretion of the accessing process other than
		 * the act of access. All other triggers must be checked
		 */
		int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
		if (rv != 0) {
			return rv;
		}
	}
#endif

	/*
	 * XXX
	 * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
	 * is there anyway to know this???
	 * there can also be other legitimate lookups in parallel
	 *
	 * XXX - should we call this on a separate thread with a timeout?
	 *
	 * XXX - should we use ISLASTCN to pick the op value???  Perhaps only leafs should
	 * get the richer set and non-leafs should get generic OP_LOOKUP?  TBD
	 */
	op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP;

	result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx);
	status = vfs_resolver_status(result);
	seq = vfs_resolver_sequence(result);

	/* record the new state, ignoring stale (out-of-order) results */
	lck_mtx_lock(&rp->vr_lock);
	if (seq > rp->vr_lastseq) {
		if (status == RESOLVER_RESOLVED) {
			rp->vr_flags |= VNT_RESOLVED;
		}
		rp->vr_lastseq = seq;
	}
	lck_mtx_unlock(&rp->vr_lock);

	/* On resolver errors, propagate the error back up */
	return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
}
11966 
/*
 * Invoke the resolver's unresolve callback (presumably to tear down a
 * triggered mount).  VNT_VFS_UNMOUNTED is held set across the callout
 * so vnode_trigger_rearm() knows VFS initiated this transition.
 */
static int
vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx)
{
	vnode_resolve_t rp;
	resolver_result_t result;
	enum resolver_status status;
	uint32_t seq;

	if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) {
		return 0;
	}

	rp = vp->v_resolve;
	lck_mtx_lock(&rp->vr_lock);

	/* Check if this vnode is already resolved */
	if ((rp->vr_flags & VNT_RESOLVED) == 0) {
		printf("vnode_trigger_unresolve: not currently resolved\n");
		lck_mtx_unlock(&rp->vr_lock);
		return 0;
	}

	rp->vr_flags |= VNT_VFS_UNMOUNTED;

	lck_mtx_unlock(&rp->vr_lock);

	/*
	 * XXX
	 * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
	 * there can also be other legitimate lookups in parallel
	 *
	 * XXX - should we call this on a separate thread with a timeout?
	 */

	result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx);
	status = vfs_resolver_status(result);
	seq = vfs_resolver_sequence(result);

	/* record the new state, ignoring stale (out-of-order) results */
	lck_mtx_lock(&rp->vr_lock);
	if (seq > rp->vr_lastseq) {
		if (status == RESOLVER_UNRESOLVED) {
			rp->vr_flags &= ~VNT_RESOLVED;
		}
		rp->vr_lastseq = seq;
	}
	rp->vr_flags &= ~VNT_VFS_UNMOUNTED;
	lck_mtx_unlock(&rp->vr_lock);

	/* On resolver errors, propagate the error back up */
	return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
}
12018 
12019 static int
triggerisdescendant(mount_t mp,mount_t rmp)12020 triggerisdescendant(mount_t mp, mount_t rmp)
12021 {
12022 	int match = FALSE;
12023 
12024 	/*
12025 	 * walk up vnode covered chain looking for a match
12026 	 */
12027 	name_cache_lock_shared();
12028 
12029 	while (1) {
12030 		vnode_t vp;
12031 
12032 		/* did we encounter "/" ? */
12033 		if (mp->mnt_flag & MNT_ROOTFS) {
12034 			break;
12035 		}
12036 
12037 		vp = mp->mnt_vnodecovered;
12038 		if (vp == NULLVP) {
12039 			break;
12040 		}
12041 
12042 		mp = vp->v_mount;
12043 		if (mp == rmp) {
12044 			match = TRUE;
12045 			break;
12046 		}
12047 	}
12048 
12049 	name_cache_unlock();
12050 
12051 	return match;
12052 }
12053 
/* Iteration state carried through trigger_unmount_callback(). */
struct trigger_unmount_info {
	vfs_context_t   ctx;            /* caller's VFS context */
	mount_t         top_mp;         /* top-level mount; iteration stops here */
	vnode_t         trigger_vp;     /* covered vnode saved for deferred unresolve */
	mount_t         trigger_mp;     /* mount covering trigger_vp when it was saved */
	uint32_t        trigger_vid;    /* vid of trigger_vp, to detect recycling */
	int             flags;          /* unmount flags for the resolver */
};
12062 
/*
 * vfs_iterate() callback used by vfs_nested_trigger_unmounts().
 *
 * For each mount visited (bottom-up), detect resolved trigger mounts that
 * are descendants of 'top_mp' and save their covered vnode so the unresolve
 * can happen on a later pass — the resolver cannot be called while a mount
 * iteration reference is held on the mount being examined.
 */
static int
trigger_unmount_callback(mount_t mp, void * arg)
{
	struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg;
	boolean_t mountedtrigger = FALSE;

	/*
	 * When we encounter the top level mount we're done
	 */
	if (mp == infop->top_mp) {
		return VFS_RETURNED_DONE;
	}

	/* Take an iocount on the covered vnode so it can be examined safely. */
	if ((mp->mnt_vnodecovered == NULL) ||
	    (vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
		return VFS_RETURNED;
	}

	/* Is 'mp' a trigger mount that is currently resolved (mounted)? */
	if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
	    (mp->mnt_vnodecovered->v_resolve != NULL) &&
	    (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) {
		mountedtrigger = TRUE;
	}
	vnode_put(mp->mnt_vnodecovered);

	/*
	 * When we encounter a mounted trigger, check if its under the top level mount
	 */
	if (!mountedtrigger || !triggerisdescendant(mp, infop->top_mp)) {
		return VFS_RETURNED;
	}

	/*
	 * Process any pending nested mount (now that its not referenced)
	 */
	if ((infop->trigger_vp != NULLVP) &&
	    (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
		vnode_t vp = infop->trigger_vp;
		int error;

		/* Drop the hold taken when the vnode was saved; iocount is held now. */
		vnode_drop(infop->trigger_vp);
		infop->trigger_vp = NULLVP;

		if (mp == vp->v_mountedhere) {
			vnode_put(vp);
			printf("trigger_unmount_callback: unexpected match '%s'\n",
			    mp->mnt_vfsstat.f_mntonname);
			return VFS_RETURNED;
		}
		if (infop->trigger_mp != vp->v_mountedhere) {
			vnode_put(vp);
			printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n",
			    infop->trigger_mp, vp->v_mountedhere);
			goto savenext;
		}

		error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
		vnode_put(vp);
		if (error) {
			printf("unresolving: '%s', err %d\n",
			    vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname :
			    "???", error);
			return VFS_RETURNED_DONE; /* stop iteration on errors */
		}
	} else if (infop->trigger_vp != NULLVP) {
		/* Saved vnode was recycled; just release the hold. */
		vnode_drop(infop->trigger_vp);
	}

savenext:
	/*
	 * We can't call resolver here since we hold a mount iter
	 * ref on mp so save its covered vp for later processing
	 */
	infop->trigger_vp = mp->mnt_vnodecovered;
	if ((infop->trigger_vp != NULLVP) &&
	    (vnode_getwithref(infop->trigger_vp) == 0)) {
		if (infop->trigger_vp->v_mountedhere == mp) {
			infop->trigger_vid = infop->trigger_vp->v_id;
			/* Hold (no iocount) keeps the vnode memory valid until processed. */
			vnode_hold(infop->trigger_vp);
			infop->trigger_mp = mp;
		}
		vnode_put(infop->trigger_vp);
	}

	return VFS_RETURNED;
}
12149 
/*
 * Attempt to unmount any trigger mounts nested underneath a mount.
 * This is a best effort attempt and no retries are performed here.
 *
 * Note: mp->mnt_rwlock is held exclusively on entry (so be carefull)
 */
__private_extern__
void
vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx)
{
	struct trigger_unmount_info info;

	/* Must have trigger vnodes */
	if (mp->mnt_numtriggers == 0) {
		return;
	}
	/* Avoid recursive requests (by checking covered vnode) */
	if ((mp->mnt_vnodecovered != NULL) &&
	    (vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
		boolean_t recursive = FALSE;

		/* If our own covered trigger is mid-unmount, this call is re-entrant. */
		if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
		    (mp->mnt_vnodecovered->v_resolve != NULL) &&
		    (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) {
			recursive = TRUE;
		}
		vnode_put(mp->mnt_vnodecovered);
		if (recursive) {
			return;
		}
	}

	/*
	 * Attempt to unmount any nested trigger mounts (best effort)
	 */
	info.ctx = ctx;
	info.top_mp = mp;
	info.trigger_vp = NULLVP;
	info.trigger_vid = 0;
	info.trigger_mp = NULL;
	info.flags = flags;

	/* Iterate bottom-up so nested triggers are unmounted before parents. */
	(void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);

	/*
	 * Process remaining nested mount (now that its not referenced)
	 */
	if ((info.trigger_vp != NULLVP) &&
	    (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
		vnode_t vp = info.trigger_vp;

		if (info.trigger_mp == vp->v_mountedhere) {
			(void) vnode_trigger_unresolve(vp, flags, ctx);
		}
		vnode_put(vp);
		vnode_drop(vp);
	} else if (info.trigger_vp != NULLVP) {
		vnode_drop(info.trigger_vp);
	}
}
12210 
12211 int
vfs_addtrigger(mount_t mp,const char * relpath,struct vnode_trigger_info * vtip,vfs_context_t ctx)12212 vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx)
12213 {
12214 	struct nameidata *ndp;
12215 	int res;
12216 	vnode_t rvp, vp;
12217 	struct vnode_trigger_param vtp;
12218 
12219 	/*
12220 	 * Must be called for trigger callback, wherein rwlock is held
12221 	 */
12222 	lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);
12223 
12224 	TRIG_LOG("Adding trigger at %s\n", relpath);
12225 	TRIG_LOG("Trying VFS_ROOT\n");
12226 
12227 	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL);
12228 
12229 	/*
12230 	 * We do a lookup starting at the root of the mountpoint, unwilling
12231 	 * to cross into other mountpoints.
12232 	 */
12233 	res = VFS_ROOT(mp, &rvp, ctx);
12234 	if (res != 0) {
12235 		goto out;
12236 	}
12237 
12238 	TRIG_LOG("Trying namei\n");
12239 
12240 	NDINIT(ndp, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE,
12241 	    CAST_USER_ADDR_T(relpath), ctx);
12242 	ndp->ni_dvp = rvp;
12243 	res = namei(ndp);
12244 	if (res != 0) {
12245 		vnode_put(rvp);
12246 		goto out;
12247 	}
12248 
12249 	vp = ndp->ni_vp;
12250 	nameidone(ndp);
12251 	vnode_put(rvp);
12252 
12253 	TRIG_LOG("Trying vnode_resolver_create()\n");
12254 
12255 	/*
12256 	 * Set up blob.  vnode_create() takes a larger structure
12257 	 * with creation info, and we needed something different
12258 	 * for this case.  One needs to win, or we need to munge both;
12259 	 * vnode_create() wins.
12260 	 */
12261 	bzero(&vtp, sizeof(vtp));
12262 	vtp.vnt_resolve_func = vtip->vti_resolve_func;
12263 	vtp.vnt_unresolve_func = vtip->vti_unresolve_func;
12264 	vtp.vnt_rearm_func = vtip->vti_rearm_func;
12265 	vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
12266 	vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
12267 	vtp.vnt_data = vtip->vti_data;
12268 	vtp.vnt_flags = vtip->vti_flags;
12269 
12270 	res = vnode_resolver_create(mp, vp, &vtp, TRUE);
12271 	vnode_put(vp);
12272 out:
12273 	kfree_type(struct nameidata, ndp);
12274 	TRIG_LOG("Returning %d\n", res);
12275 	return res;
12276 }
12277 
12278 #endif /* CONFIG_TRIGGERS */
12279 
12280 vm_offset_t
kdebug_vnode(vnode_t vp)12281 kdebug_vnode(vnode_t vp)
12282 {
12283 	return VM_KERNEL_ADDRPERM(vp);
12284 }
12285 
/*
 * When non-zero, writes to uncached files are followed by a drive cache
 * flush (consulted by vnode_should_flush_after_write()).  Tunable via the
 * kern.flush_cache_on_write sysctl.
 */
static int flush_cache_on_write = 0;
SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write,
    CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0,
    "always flush the drive cache on writes to uncached files");
12290 
12291 int
vnode_should_flush_after_write(vnode_t vp,int ioflag)12292 vnode_should_flush_after_write(vnode_t vp, int ioflag)
12293 {
12294 	return flush_cache_on_write
12295 	       && (ISSET(ioflag, IO_NOCACHE) || vnode_isnocache(vp));
12296 }
12297 
/*
 * sysctl for use by disk I/O tracing tools to get the list of existing
 * vnodes' paths
 */

/* Number of unsigned longs needed to hold a MAXPATHLEN-byte path. */
#define NPATH_WORDS (MAXPATHLEN / sizeof(unsigned long))
struct vnode_trace_paths_context {
	uint64_t count;  /* vnodes emitted since the last preemption yield */
	/*
	 * Must be a multiple of 4, then -1, for tracing!
	 */
	unsigned long path[NPATH_WORDS + (4 - (NPATH_WORDS % 4)) - 1];
};
12311 
/*
 * vnode_iterate() callback: resolve the vnode's path and emit it as a
 * kdebug VFS lookup event.  Yields to preemption every 1000 vnodes so a
 * large mount doesn't monopolize the CPU.
 */
static int
vnode_trace_path_callback(struct vnode *vp, void *vctx)
{
	struct vnode_trace_paths_context *ctx = vctx;
	size_t path_len = sizeof(ctx->path);

	int getpath_len = (int)path_len;
	if (vn_getpath(vp, (char *)ctx->path, &getpath_len) == 0) {
		/* vn_getpath() NUL-terminates, and len includes the NUL. */
		assert(getpath_len >= 0);
		path_len = (size_t)getpath_len;

		assert(path_len <= sizeof(ctx->path));
		kdebug_vfs_lookup(ctx->path, (int)path_len, vp,
		    KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);

		if (++(ctx->count) == 1000) {
			thread_yield_to_preemption();
			ctx->count = 0;
		}
	}

	return VNODE_RETURNED;
}
12336 
12337 static int
vfs_trace_paths_callback(mount_t mp,void * arg)12338 vfs_trace_paths_callback(mount_t mp, void *arg)
12339 {
12340 	if (mp->mnt_flag & MNT_LOCAL) {
12341 		vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
12342 	}
12343 
12344 	return VFS_RETURNED;
12345 }
12346 
/*
 * vfs.generic.trace_paths handler: emit a VFS_LOOKUP kdebug event for every
 * vnode on every local mount, letting I/O tracing tools map vnode addresses
 * back to paths.  Root-only, and only useful while kdebug is recording.
 */
static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS {
	struct vnode_trace_paths_context ctx;

	(void)oidp;
	(void)arg1;
	(void)arg2;
	(void)req;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	/* Pointless unless the trace events would actually be recorded. */
	if (!kdebug_enable || !kdebug_debugid_enabled(VFS_LOOKUP)) {
		return EINVAL;
	}

	bzero(&ctx, sizeof(struct vnode_trace_paths_context));

	vfs_iterate(0, vfs_trace_paths_callback, &ctx);

	return 0;
}
12369 
12370 SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");
12371 
12372 #if CONFIG_FILE_LEASES
12373 #include <IOKit/IOBSD.h>
12374 #include <sys/file_internal.h>
12375 
12376 #define FILE_LEASES_ENTITLEMENT    "com.apple.private.vfs.file-leases"
12377 
12378 static uint32_t lease_break_timeout = 60; /* secs */
12379 
12380 #if (DEVELOPMENT || DEBUG)
12381 static int lease_debug = 0;
12382 static int lease_entitlement_override = 0;
12383 
12384 SYSCTL_NODE(_vfs, OID_AUTO, lease, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs lease");
12385 SYSCTL_UINT(_vfs_lease, OID_AUTO, break_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_break_timeout, 0, "");
12386 SYSCTL_INT(_vfs_lease, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_debug, 0, "");
12387 SYSCTL_INT(_vfs_lease, OID_AUTO, entitlement_override, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_entitlement_override, 0, "");
12388 
12389 #define LEASEDBG(fmt, args...)                                       \
12390 do {                                                                 \
12391 	if (__improbable(lease_debug)) {                                 \
12392 	        pid_t cur_pid = proc_getpid(current_proc());             \
12393 	        printf("%s(%d): " fmt "\n", __func__, cur_pid, ##args);  \
12394 	}                                                                \
12395 } while(0)
12396 #else
12397 #define LEASEDBG(fmt, args...)  /**/
12398 #endif /* (DEVELOPMENT || DEBUG) */
12399 
12400 static bool
allow_setlease(vfs_context_t ctx)12401 allow_setlease(vfs_context_t ctx)
12402 {
12403 	bool entitled;
12404 
12405 	entitled = IOTaskHasEntitlement(vfs_context_task(ctx),
12406 	    FILE_LEASES_ENTITLEMENT);
12407 
12408 #if (DEVELOPMENT || DEBUG)
12409 	if (!entitled) {
12410 		entitled = (lease_entitlement_override == 1);
12411 	}
12412 #endif
12413 
12414 	return entitled;
12415 }
12416 
12417 static file_lease_t
file_lease_alloc(struct fileglob * fg,int fl_type,pid_t pid)12418 file_lease_alloc(struct fileglob *fg, int fl_type, pid_t pid)
12419 {
12420 	file_lease_t fl;
12421 
12422 	fl = kalloc_type(struct file_lease, Z_WAITOK);
12423 	/*
12424 	 * Duplicated file descriptors created by dup() or fork() would have the
12425 	 * same 'fileglob' so the lease can be released or modified with the
12426 	 * duplicated fds. Opening the same file (by either same or different
12427 	 * process) would have different 'fileglob' so a lease always follows a
12428 	 * 'fileglob'.
12429 	 */
12430 	fl->fl_fg = fg;
12431 	fl->fl_type = fl_type;
12432 	fl->fl_pid = pid;
12433 	fl->fl_downgrade_start = fl->fl_release_start = 0;
12434 
12435 	return fl;
12436 }
12437 
/* Free a lease allocated by file_lease_alloc(); must already be unlinked. */
static void
file_lease_free(file_lease_t fl)
{
	kfree_type(struct file_lease, fl);
}
12443 
12444 /*
12445  * A read lease can be placed only on a file/directory that is opened for
12446  * read-only which means no other processes have the file/directory opened in
12447  * read-write/write-only mode or mmap'ed writable.
12448  * A write lease can be placed on a file only if there are no other opens
12449  * for the file.
12450  *
12451  * Needs to be called with vnode's lock held.
12452  */
12453 static int
check_for_open_conflict(vnode_t vp,struct fileglob * fg,int fl_type,int expcounts)12454 check_for_open_conflict(vnode_t vp, struct fileglob *fg, int fl_type,
12455     int expcounts)
12456 {
12457 	int error = 0;
12458 
12459 	if (fl_type == F_RDLCK) {
12460 		if (vp->v_writecount > expcounts &&
12461 		    !(vp->v_writecount == 1 && (fg->fg_flag & FWRITE))) {
12462 			error = EAGAIN;
12463 		} else if (ubc_is_mapped_writable(vp)) {
12464 			error = EAGAIN;
12465 		}
12466 	} else if (fl_type == F_WRLCK && vp->v_usecount > expcounts) {
12467 		error = EAGAIN;
12468 	}
12469 
12470 	return error;
12471 }
12472 
/*
 * Change an existing lease's type (and optionally its owning fileglob), and
 * wake any lease breakers waiting for a pending release/downgrade.
 *
 * Needs to be called with vnode's lock held.
 */
static void
modify_file_lease(vnode_t vp, file_lease_t fl, int new_fl_type,
    struct fileglob *new_fg)
{
	LEASEDBG("fl %p changing fl_type from %d to %d (flags 0x%x)",
	    fl, fl->fl_type, new_fl_type, fl->fl_flags);

	fl->fl_type = new_fl_type;

	/*
	 * The lease being modified may be using a different file
	 * descriptor, so usurp the fileglob pointer here.  In this
	 * case the old descriptor no longer holds the lease.
	 */
	if (new_fg != NULL) {
		fl->fl_fg = new_fg;
	}

	/* A breaker may be sleeping on v_leases waiting for this transition. */
	if (fl->fl_flags & FL_FLAG_RELEASE_PENDING ||
	    fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING) {
		wakeup(&vp->v_leases);
	}
}
12497 
12498 static int
acquire_file_lease(vnode_t vp,struct fileglob * fg,int fl_type,int expcounts,vfs_context_t ctx)12499 acquire_file_lease(vnode_t vp, struct fileglob *fg, int fl_type, int expcounts,
12500     vfs_context_t ctx)
12501 {
12502 	file_lease_t fl, new_fl, our_fl;
12503 	int error;
12504 
12505 	/* Make sure "expected count" looks sane. */
12506 	if (expcounts < 0 || expcounts > OPEN_MAX) {
12507 		return EINVAL;
12508 	}
12509 
12510 	new_fl = file_lease_alloc(fg, fl_type, vfs_context_pid(ctx));
12511 
12512 	vnode_lock(vp);
12513 
12514 	error = check_for_open_conflict(vp, fg, fl_type, expcounts);
12515 	if (error) {
12516 		LEASEDBG("open conflict on vp %p type %d writecnt %d usecnt %d "
12517 		    "fl_type %d expcounts %d",
12518 		    vp, vp->v_type, vp->v_writecount, vp->v_usecount, fl_type,
12519 		    expcounts);
12520 		goto out;
12521 	}
12522 
12523 	our_fl = NULL;
12524 	LIST_FOREACH(fl, &vp->v_leases, fl_link) {
12525 		/* Does the existing lease belong to us? */
12526 		if (fl->fl_fg == new_fl->fl_fg ||
12527 		    fl->fl_pid == new_fl->fl_pid) {
12528 			our_fl = fl;
12529 			continue;
12530 		}
12531 
12532 		/*
12533 		 * We don't allow placing a new write lease when there is an existing
12534 		 * read lease that doesn't belong to us. We also don't allow putting
12535 		 * a new read lease if there is a pending release on the lease.
12536 		 * Putting a new read lease when there is a pending downgrade on the
12537 		 * lease is fine as it won't cause lease conflict.
12538 		 */
12539 		if (fl_type == F_WRLCK || fl->fl_flags & FL_FLAG_RELEASE_PENDING) {
12540 			break;
12541 		}
12542 	}
12543 
12544 	/*
12545 	 * Found an existing lease that we don't own and it conflicts with the
12546 	 * new lease.
12547 	 */
12548 	if (fl) {
12549 		LEASEDBG("lease conflict on vp %p fl %p fl_type %d cur_fl_type %d",
12550 		    vp, fl, fl_type, fl->fl_type);
12551 		goto out;
12552 	}
12553 
12554 	/* Found an existing lease that we own so just change the type. */
12555 	if (our_fl) {
12556 		LEASEDBG("replace lease on vp %p fl %p old_fl_type %d new_fl_type %d",
12557 		    vp, our_fl, our_fl->fl_type, fl_type);
12558 
12559 		modify_file_lease(vp, our_fl, new_fl->fl_type, new_fl->fl_fg);
12560 		goto out;
12561 	}
12562 
12563 	LEASEDBG("acquired lease on vp %p type %d fl %p fl_type %d fg %p",
12564 	    vp, vp->v_type, new_fl, new_fl->fl_type, new_fl->fl_fg);
12565 
12566 	LIST_INSERT_HEAD(&vp->v_leases, new_fl, fl_link);
12567 	new_fl = NULL;
12568 
12569 out:
12570 	vnode_unlock(vp);
12571 
12572 	if (new_fl) {
12573 		file_lease_free(new_fl);
12574 	}
12575 
12576 	return error;
12577 }
12578 
/*
 * Release the lease held through 'fg' on 'vp'.
 * Returns 0 on success, or ENOLCK if no lease belonging to 'fg' exists.
 */
static int
release_file_lease(vnode_t vp, struct fileglob *fg)
{
	file_lease_t fl, fl_tmp;
	int error = 0;

	LEASEDBG("request to release lease on vp %p type %d fg %p",
	    vp, vp->v_type, fg);

	vnode_lock(vp);

	LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) {
		if (fl->fl_fg == fg) {
			LEASEDBG("released lease on vp %p fl %p type %d",
			    vp, fl, fl->fl_type);

			/* Unlink, then wake any waiting breakers via F_UNLCK. */
			LIST_REMOVE(fl, fl_link);
			modify_file_lease(vp, fl, F_UNLCK, NULL);
			break;
		}
	}

	vnode_unlock(vp);

	/* 'fl' is non-NULL (and unlinked) only if a matching lease was found. */
	if (fl) {
		file_lease_free(fl);
	} else {
		error = ENOLCK;
	}

	return error;
}
12611 
12612 /*
12613  * Acquire or release a file lease according to the given type (F_RDLCK,
12614  * F_WRLCK or F_UNLCK).
12615  *
12616  * Returns:	0			Success
12617  *		EAGAIN			Failed to acquire a file lease due to conflicting opens
12618  *		ENOLCK			Failed to release a file lease due to lease not found
12619  *		EPERM           Current task doesn't have the entitlement
12620  */
12621 int
vnode_setlease(vnode_t vp,struct fileglob * fg,int fl_type,int expcounts,vfs_context_t ctx)12622 vnode_setlease(vnode_t vp, struct fileglob *fg, int fl_type, int expcounts,
12623     vfs_context_t ctx)
12624 {
12625 	int error;
12626 
12627 	if (!allow_setlease(ctx)) {
12628 		return EPERM;
12629 	}
12630 
12631 	error = (fl_type == F_UNLCK) ? release_file_lease(vp, fg) :
12632 	    acquire_file_lease(vp, fg, fl_type, expcounts, ctx);
12633 
12634 	return error;
12635 }
12636 
12637 /*
12638  * Retrieve the currently in place lease for the file.
12639  *
12640  * Returns:
12641  *		F_RDLCK			Read lease
12642  *		F_WRLCK			Write lease
12643  *		F_UNLCK			No lease
12644  */
12645 int
vnode_getlease(vnode_t vp)12646 vnode_getlease(vnode_t vp)
12647 {
12648 	file_lease_t fl;
12649 	int fl_type = F_UNLCK;
12650 
12651 	vnode_lock(vp);
12652 
12653 	/*
12654 	 * There should be only one type of lease in the list as read and write
12655 	 * leases can't co-exist for the same file.
12656 	 */
12657 	fl = LIST_FIRST(&vp->v_leases);
12658 	if (fl) {
12659 		fl_type = fl->fl_type;
12660 	}
12661 
12662 	vnode_unlock(vp);
12663 
12664 	LEASEDBG("vp %p fl %p fl_type %d", vp, fl, fl_type);
12665 
12666 	return fl_type;
12667 }
12668 
12669 /* Must be called with vnode's lock held. */
12670 static bool
check_for_lease_conflict(vnode_t vp,int breaker_fl_type,vfs_context_t ctx)12671 check_for_lease_conflict(vnode_t vp, int breaker_fl_type, vfs_context_t ctx)
12672 {
12673 	file_lease_t fl;
12674 	pid_t pid = vfs_context_pid(ctx);
12675 	bool is_conflict = false;
12676 
12677 	LIST_FOREACH(fl, &vp->v_leases, fl_link) {
12678 		if ((fl->fl_type == F_WRLCK && fl->fl_pid != pid) ||
12679 		    (breaker_fl_type == F_WRLCK && fl->fl_pid != pid)) {
12680 			LEASEDBG("conflict detected on vp %p type %d fl_type %d "
12681 			    "breaker_fl_type %d",
12682 			    vp, vp->v_type, fl->fl_type, breaker_fl_type);
12683 
12684 			is_conflict = true;
12685 			break;
12686 		}
12687 	}
12688 
12689 	return is_conflict;
12690 }
12691 
12692 static uint64_t
absolutetime_elapsed_in_secs(uint64_t start)12693 absolutetime_elapsed_in_secs(uint64_t start)
12694 {
12695 	uint64_t elapsed, elapsed_sec;
12696 	uint64_t now = mach_absolute_time();
12697 
12698 	elapsed = now - start;
12699 	absolutetime_to_nanoseconds(elapsed, &elapsed_sec);
12700 	elapsed_sec /= NSEC_PER_SEC;
12701 
12702 	return elapsed_sec;
12703 }
12704 
/*
 * Force-complete any lease break whose timeout has expired: pending
 * downgrades become read leases, pending releases are removed outright.
 * Wakes all blocked lease breakers when done.
 *
 * Must be called with vnode's lock held.
 */
static void
handle_lease_break_timedout(vnode_t vp)
{
	file_lease_t fl, fl_tmp;
	uint64_t elapsed_sec;

	LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) {
		if (fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING) {
			elapsed_sec = absolutetime_elapsed_in_secs(fl->fl_downgrade_start);

			if (elapsed_sec >= lease_break_timeout) {
				LEASEDBG("force downgrade on vp %p for fl %p elapsed %llu "
				    "timeout %u", vp, fl, elapsed_sec, lease_break_timeout);

				/* Holder didn't downgrade in time; do it on its behalf. */
				fl->fl_flags &= ~FL_FLAG_DOWNGRADE_PENDING;
				fl->fl_downgrade_start = 0;
				modify_file_lease(vp, fl, F_RDLCK, NULL);
				continue;
			}
		}
		if (fl->fl_flags & FL_FLAG_RELEASE_PENDING) {
			elapsed_sec = absolutetime_elapsed_in_secs(fl->fl_release_start);

			if (elapsed_sec >= lease_break_timeout) {
				LEASEDBG("force release on vp %p for fl %p elapsed %llu "
				    "timeout %u", vp, fl, elapsed_sec, lease_break_timeout);

				/* Holder didn't release in time; remove the lease. */
				LIST_REMOVE(fl, fl_link);
				file_lease_free(fl);
				continue;
			}
		}
	}

	/* Wakeup the lease breaker(s). */
	wakeup(&vp->v_leases);
}
12743 
12744 /* Must be called with vnode's lock held. */
12745 static void
wait_for_lease_break(vnode_t vp,int breaker_fl_type,vfs_context_t ctx)12746 wait_for_lease_break(vnode_t vp, int breaker_fl_type, vfs_context_t ctx)
12747 {
12748 	file_lease_t fl;
12749 	struct timespec ts;
12750 	uint64_t elapsed_sec, start_time;
12751 	int error;
12752 
12753 restart:
12754 	fl = LIST_FIRST(&vp->v_leases);
12755 	assert(fl);
12756 
12757 	/*
12758 	 * In a rare case it is possible that the lease that we are blocked on has
12759 	 * been released and a new lease has been put in place after we are
12760 	 * signalled to wake up. In this particular, we would treat it as no
12761 	 * conflict and proceed. This could only happen for directory leasing.
12762 	 */
12763 	if ((fl->fl_flags & (FL_FLAG_DOWNGRADE_PENDING | FL_FLAG_RELEASE_PENDING)) == 0) {
12764 		LEASEDBG("new lease in place on vp %p fl %p fl_type %d "
12765 		    "breaker_fl_type %d",
12766 		    vp, fl, fl->fl_type, breaker_fl_type);
12767 
12768 		return;
12769 	}
12770 	/*
12771 	 * Figure out which timer to use for lease break timedout as we could have
12772 	 * both timers active. If both timers active, pick the one with earliest
12773 	 * start time.
12774 	 */
12775 	if (fl->fl_release_start) {
12776 		if (fl->fl_downgrade_start == 0 ||
12777 		    fl->fl_downgrade_start < fl->fl_release_start) {
12778 			start_time = fl->fl_release_start;
12779 		} else {
12780 			start_time = fl->fl_downgrade_start;
12781 		}
12782 	} else {
12783 		start_time = fl->fl_downgrade_start;
12784 	}
12785 	assert(start_time > 0);
12786 
12787 	elapsed_sec = absolutetime_elapsed_in_secs(start_time);
12788 
12789 	LEASEDBG("elapsed_sec %llu release_start %llu downgrade_start %llu",
12790 	    elapsed_sec, fl->fl_release_start, fl->fl_downgrade_start);
12791 
12792 	ts.tv_sec = (lease_break_timeout > elapsed_sec ?
12793 	    (lease_break_timeout - elapsed_sec) : 0);
12794 	ts.tv_nsec = (ts.tv_sec == 0 ? 1 : 0);
12795 	error = msleep(&vp->v_leases, &vp->v_lock, PVFS, __func__, &ts);
12796 
12797 	if (error == 0 || error != EWOULDBLOCK) {
12798 		/*
12799 		 * Woken up due to lease is released/downgraded by lease holder.
12800 		 * We don't expect any other error from msleep() beside EWOULDBLOCK.
12801 		 * Check if there is any further conflicts. If so, then continue to
12802 		 * wait for the next conflict to resolve.
12803 		 */
12804 		if (check_for_lease_conflict(vp, breaker_fl_type, ctx)) {
12805 			goto restart;
12806 		}
12807 	} else {
12808 		/*
12809 		 * Woken due to lease break timeout expired (EWOULDBLOCK returned).
12810 		 * Break/downgrade all conflicting leases.
12811 		 */
12812 		handle_lease_break_timedout(vp);
12813 
12814 		if (check_for_lease_conflict(vp, breaker_fl_type, ctx)) {
12815 			goto restart;
12816 		}
12817 	}
12818 }
12819 
12820 /* Must be called with vnode's lock held. */
12821 static void
send_lease_break_event(vnode_t vp,uint32_t event)12822 send_lease_break_event(vnode_t vp, uint32_t event)
12823 {
12824 	if (vp->v_knotes.slh_first != NULL) {
12825 		KNOTE(&vp->v_knotes, event);
12826 	}
12827 }
12828 
12829 static bool
is_dataless_file(vnode_t vp,vfs_context_t ctx)12830 is_dataless_file(vnode_t vp, vfs_context_t ctx)
12831 {
12832 	struct vnode_attr va;
12833 	bool is_dataless = false;
12834 	int error;
12835 
12836 	VATTR_INIT(&va);
12837 	VATTR_WANTED(&va, va_flags);
12838 
12839 	error = vnode_getattr(vp, &va, ctx);
12840 	if (!error && (va.va_flags & SF_DATALESS)) {
12841 		is_dataless = true;
12842 	}
12843 
12844 	return is_dataless;
12845 }
12846 
/*
 * Break lease(s) in place for the file when there is conflict.
 * This function would return 0 for almost all call sites. The only exception
 * is when it is called from open1() with O_NONBLOCK flag and it needs to block
 * waiting for the lease conflict(s) to resolve. In this case EWOULDBLOCK is
 * returned.
 */
int
vnode_breaklease(vnode_t vp, uint32_t oflags, vfs_context_t ctx)
{
	file_lease_t fl;
	uint64_t now;
	int fl_type;
	int error = 0;

	vnode_lock(vp);

	/* Fast path: no leases on this vnode. */
	if (__probable(LIST_EMPTY(&vp->v_leases))) {
		goto out_unlock;
	}

	/* Determine the access mode requested by the lease breaker. */
	fl_type = (oflags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC)) ? F_WRLCK : F_RDLCK;

	/*
	 * If the lease-breaker is just reading, check that it can break
	 * leases first. If the lease-breaker is writing, or if the
	 * context was not specified, we always break.
	 * We skip lease break if the lease-breaker is dataless manipulator and
	 * the file is dataless.
	 */
	if ((fl_type == F_RDLCK && !vfs_context_can_break_leases(ctx)) ||
	    (vfs_context_is_dataless_manipulator(ctx) && (vp->v_type == VREG) &&
	    is_dataless_file(vp, ctx))) {
		goto out_unlock;
	}

	if (!check_for_lease_conflict(vp, fl_type, ctx)) {
		goto out_unlock;
	}

	now = mach_absolute_time();

	LEASEDBG("break lease on vp %p type %d oflags 0x%x cur_time %llu",
	    vp, vp->v_type, oflags, now);

	/*
	 * We get to this point then this means all lease(s) are conflict and
	 * we need to send the lease break event to the lease holder(s).
	 * It is possible that a lease could have both downgrade and release events
	 * pending triggered by multiple breakers trying to open the file in
	 * different modes. Both events would have different lease break timers.
	 * Consider the following case:
	 * 1. Process A holds the write lease on file X.
	 * 2. Provess B opens the file X in read-only mode.
	 *    This triggers downgrade lease event to Process A.
	 * 3. While downgrade is pending, Process C opens the file X in read-write
	 *    mode. This triggers release lease event to Process A.
	 */
	LIST_FOREACH(fl, &vp->v_leases, fl_link) {
		if (fl_type == F_WRLCK) {
			/* File is opened for writing or truncate. */
			if (fl->fl_flags & FL_FLAG_RELEASE_PENDING) {
				continue;
			}
			/* Start the release timer and notify the holder. */
			fl->fl_release_start = now;
			fl->fl_flags |= FL_FLAG_RELEASE_PENDING;
			send_lease_break_event(vp, NOTE_LEASE_RELEASE);
		} else {
			/* File is opened for reading. */
			if (fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING ||
			    fl->fl_flags & FL_FLAG_RELEASE_PENDING) {
				continue;
			}
			/* Start the downgrade timer and notify the holder. */
			fl->fl_downgrade_start = now;
			fl->fl_flags |= FL_FLAG_DOWNGRADE_PENDING;
			send_lease_break_event(vp, NOTE_LEASE_DOWNGRADE);
		}
	}

	/*
	 * If open is requested with O_NONBLOCK, then we can't block and wait for
	 * the lease to be released/downgraded. Just bail out with EWOULDBLOCK.
	 */
	if (oflags & O_NONBLOCK) {
		error = EWOULDBLOCK;
		goto out;
	}

	/* Block until the conflict resolves (or force-break on timeout). */
	wait_for_lease_break(vp, fl_type, ctx);

out:
	LEASEDBG("break lease on vp %p oflags 0x%x, error %d", vp, oflags, error);

out_unlock:
	vnode_unlock(vp);

	return error;
}
12946 
/*
 * Get parent vnode by parent ID (only for file system that supports
 * MNTK_PATH_FROM_ID).
 * On success, the parent's vnode is returned with iocount held.
 * Returns NULLVP on any failure.
 */
static vnode_t
vnode_getparent_byid(vnode_t vp)
{
	struct vnode_attr va;
	vnode_t dvp = NULLVP;
	vfs_context_t ctx = vfs_context_current();
	int error;

	/* The filesystem must support path-from-ID lookups (VFS_VGET by id). */
	if (!(vp->v_mount->mnt_kern_flag & MNTK_PATH_FROM_ID)) {
		goto out;
	}

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_parentid);

	/* Get the vnode's parent id from the file system. */
	error = vnode_getattr(vp, &va, ctx);
	if (error || !VATTR_IS_SUPPORTED(&va, va_parentid)) {
		goto out;
	}

	/*
	 * Ask the file system for the parent vnode.
	 * We are ignoring the error here as we don't expect the parent vnode to be
	 * populated on error.
	 */
	(void)VFS_VGET(vp->v_mount, (ino64_t)va.va_parentid, &dvp, ctx);

out:
	return dvp;
}
12983 
/*
 * Break directory's lease.
 * If 'need_parent' is true, then parent is obtained via vnode_getparent() (or
 * vnode_getparent_byid()) on the provided 'vp'; otherwise 'vp' itself is
 * treated as the directory whose lease should be broken.
 */
void
vnode_breakdirlease(vnode_t vp, bool need_parent, uint32_t oflags)
{
	vnode_t dvp;

	/* Only regular files and directories participate; never the root. */
	if ((vnode_vtype(vp) != VREG && vnode_vtype(vp) != VDIR) ||
	    (vp == rootvnode)) {
		return;
	}

	/*
	 * If parent is not provided, first try to get it from the name cache.
	 * If failed, then we will attempt to ask the file system for parent vnode.
	 * This is just a best effort as both attempts could still fail.
	 */
	if (need_parent) {
		dvp = vnode_getparent(vp);
		if (__improbable(dvp == NULLVP)) {
			dvp = vnode_getparent_byid(vp);
		}
	} else {
		/* Caller passed the directory itself; no reference was taken. */
		dvp = vp;
	}

	if (__probable(dvp != NULLVP)) {
		/* Always break dir leases. */
		(void)vnode_breaklease(dvp, oflags, vfs_context_current());
	}

	/* Drop the iocount only if we obtained the parent ourselves. */
	if (need_parent && (dvp != NULLVP)) {
		vnode_put(dvp);
	}
}
13022 
/*
 * Revoke all lease(s) in place for the file.
 * This is called when the vnode is reclaimed.
 * 'locked' indicates whether the caller already holds the vnode lock.
 */
void
vnode_revokelease(vnode_t vp, bool locked)
{
	file_lease_t fl, fl_tmp;
	bool need_wakeup = false;

	/* Leases only exist on regular files and directories. */
	if ((vnode_vtype(vp) != VREG && vnode_vtype(vp) != VDIR)) {
		return;
	}

	if (!locked) {
		vnode_lock(vp);
	}

	/* Tear down every lease on the list. */
	LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) {
		LIST_REMOVE(fl, fl_link);
		file_lease_free(fl);
		need_wakeup = true;
	}

	/* Wakeup any lease breaker(s) that might be currently blocked. */
	if (__improbable(need_wakeup)) {
		wakeup(&vp->v_leases);
	}

	if (!locked) {
		vnode_unlock(vp);
	}
}
13056 
13057 #endif /* CONFIG_FILE_LEASES */
13058