xref: /xnu-8792.61.2/bsd/vfs/vfs_cache.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1989, 1993, 1995
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * This code is derived from software contributed to Berkeley by
34  * Poul-Henning Kamp of the FreeBSD Project.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgement:
46  *	This product includes software developed by the University of
47  *	California, Berkeley and its contributors.
48  * 4. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  *
64  *
65  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/time.h>
76 #include <sys/mount_internal.h>
77 #include <sys/vnode_internal.h>
78 #include <miscfs/specfs/specdev.h>
79 #include <sys/namei.h>
80 #include <sys/errno.h>
81 #include <kern/kalloc.h>
82 #include <sys/kauth.h>
83 #include <sys/user.h>
84 #include <sys/paths.h>
85 #include <os/overflow.h>
86 
87 #if CONFIG_MACF
88 #include <security/mac_framework.h>
89 #endif
90 
91 /*
92  * Name caching works as follows:
93  *
94  * Names found by directory scans are retained in a cache
95  * for future reference.  It is managed LRU, so frequently
96  * used names will hang around.  Cache is indexed by hash value
97  * obtained from (vp, name) where vp refers to the directory
98  * containing name.
99  *
100  * If it is a "negative" entry, (i.e. for a name that is known NOT to
101  * exist) the vnode pointer will be NULL.
102  *
103  * Upon reaching the last segment of a path, if the reference
104  * is for DELETE, or NOCACHE is set (rewrite), and the
105  * name is located in the cache, it will be dropped.
106  */
107 
108 /*
109  * Structures associated with name cacheing.
110  */
111 
112 ZONE_DEFINE_TYPE(namecache_zone, "namecache", struct namecache, ZC_NONE);
113 
114 LIST_HEAD(nchashhead, namecache) * nchashtbl;    /* Hash Table */
115 u_long  nchashmask;
116 u_long  nchash;                         /* size of hash table - 1 */
117 long    numcache;                       /* number of cache entries allocated */
118 int     desiredNodes;
119 int     desiredNegNodes;
120 int     ncs_negtotal;
121 TUNABLE_WRITEABLE(int, nc_disabled, "-novfscache", 0);
122 TAILQ_HEAD(, namecache) nchead;         /* chain of all name cache entries */
123 TAILQ_HEAD(, namecache) neghead;        /* chain of only negative cache entries */
124 
125 
126 #if COLLECT_STATS
127 
128 struct  nchstats nchstats;              /* cache effectiveness statistics */
129 
130 #define NCHSTAT(v) {            \
131 	nchstats.v++;           \
132 }
133 #define NAME_CACHE_LOCK()               name_cache_lock()
134 #define NAME_CACHE_UNLOCK()             name_cache_unlock()
135 #define NAME_CACHE_LOCK_SHARED()        name_cache_lock()
136 
137 #else
138 
139 #define NCHSTAT(v)
140 #define NAME_CACHE_LOCK()               name_cache_lock()
141 #define NAME_CACHE_UNLOCK()             name_cache_unlock()
142 #define NAME_CACHE_LOCK_SHARED()        name_cache_lock_shared()
143 
144 #endif
145 
146 
147 /* vars for name cache list lock */
148 static LCK_GRP_DECLARE(namecache_lck_grp, "Name Cache");
149 static LCK_RW_DECLARE(namecache_rw_lock, &namecache_lck_grp);
150 
151 static LCK_GRP_DECLARE(strcache_lck_grp, "String Cache");
152 static LCK_ATTR_DECLARE(strcache_lck_attr, 0, 0);
153 LCK_RW_DECLARE_ATTR(strtable_rw_lock, &strcache_lck_grp, &strcache_lck_attr);
154 
155 static LCK_GRP_DECLARE(rootvnode_lck_grp, "rootvnode");
156 LCK_RW_DECLARE(rootvnode_rw_lock, &rootvnode_lck_grp);
157 
158 #define NUM_STRCACHE_LOCKS 1024
159 
160 lck_mtx_t strcache_mtx_locks[NUM_STRCACHE_LOCKS];
161 
162 
163 static vnode_t cache_lookup_locked(vnode_t dvp, struct componentname *cnp);
164 static const char *add_name_internal(const char *, uint32_t, u_int, boolean_t, u_int);
165 static void init_string_table(void);
166 static void cache_delete(struct namecache *, int);
167 static void cache_enter_locked(vnode_t dvp, vnode_t vp, struct componentname *cnp, const char *strname);
168 static void cache_purge_locked(vnode_t vp, kauth_cred_t *credp);
169 
170 #ifdef DUMP_STRING_TABLE
171 /*
172  * Internal dump function used for debugging
173  */
174 void dump_string_table(void);
175 #endif  /* DUMP_STRING_TABLE */
176 
177 static void init_crc32(void);
178 static unsigned int crc32tab[256];
179 
180 
181 #define NCHHASH(dvp, hash_val) \
182 	(&nchashtbl[(dvp->v_id ^ (hash_val)) & nchashmask])
183 
184 /*
185  * This function tries to check if a directory vp is a subdirectory of dvp
186  * only from valid v_parent pointers. It is called with the name cache lock
187  * held and does not drop the lock anytime inside the function.
188  *
 * It returns a boolean that indicates whether or not it was able to
 * successfully infer the parent/descendant relationship via the v_parent
 * pointers, or if it could not infer such a relationship and the decision
 * must be delegated to the owning filesystem.
 *
 * If it does not defer the decision, i.e. it was successfully able to determine
 * the parent/descendant relationship, *is_subdir tells the caller if vp is a
 * subdirectory of dvp.
197  *
198  * If the decision is deferred, *next_vp is where it stopped i.e. *next_vp
199  * is the vnode whose parent is to be determined from the filesystem.
200  * *is_subdir, in this case, is not indicative of anything and should be
201  * ignored.
202  *
203  * The return value and output args should be used as follows :
204  *
205  * defer = cache_check_vnode_issubdir(vp, dvp, is_subdir, next_vp);
206  * if (!defer) {
207  *      if (*is_subdir)
208  *              vp is subdirectory;
209  *      else
210  *              vp is not a subdirectory;
211  * } else {
212  *      if (*next_vp)
213  *              check this vnode's parent from the filesystem
214  *      else
215  *              error (likely because of forced unmount).
216  * }
217  *
218  */
219 static boolean_t
cache_check_vnode_issubdir(vnode_t vp,vnode_t dvp,boolean_t * is_subdir,vnode_t * next_vp)220 cache_check_vnode_issubdir(vnode_t vp, vnode_t dvp, boolean_t *is_subdir,
221     vnode_t *next_vp)
222 {
223 	vnode_t tvp = vp;
224 	int defer = FALSE;
225 
226 	*is_subdir = FALSE;
227 	*next_vp = NULLVP;
228 	while (1) {
229 		mount_t tmp;
230 
231 		if (tvp == dvp) {
232 			*is_subdir = TRUE;
233 			break;
234 		} else if (tvp == rootvnode) {
235 			/* *is_subdir = FALSE */
236 			break;
237 		}
238 
239 		tmp = tvp->v_mount;
240 		while ((tvp->v_flag & VROOT) && tmp && tmp->mnt_vnodecovered &&
241 		    tvp != dvp && tvp != rootvnode) {
242 			tvp = tmp->mnt_vnodecovered;
243 			tmp = tvp->v_mount;
244 		}
245 
246 		/*
247 		 * If dvp is not at the top of a mount "stack" then
248 		 * vp is not a subdirectory of dvp either.
249 		 */
250 		if (tvp == dvp || tvp == rootvnode) {
251 			/* *is_subdir = FALSE */
252 			break;
253 		}
254 
255 		if (!tmp) {
256 			defer = TRUE;
257 			*next_vp = NULLVP;
258 			break;
259 		}
260 
261 		if ((tvp->v_flag & VISHARDLINK) || !(tvp->v_parent)) {
262 			defer = TRUE;
263 			*next_vp = tvp;
264 			break;
265 		}
266 
267 		tvp = tvp->v_parent;
268 	}
269 
270 	return defer;
271 }
272 
273 /* maximum times retry from potentially transient errors in vnode_issubdir */
274 #define MAX_ERROR_RETRY 3
275 
276 /*
277  * This function checks if a given directory (vp) is a subdirectory of dvp.
278  * It walks backwards from vp and if it hits dvp in its parent chain,
279  * it is a subdirectory. If it encounters the root directory, it is not
280  * a subdirectory.
281  *
282  * This function returns an error if it is unsuccessful and 0 on success.
283  *
284  * On entry (and exit) vp has an iocount and if this function has to take
285  * any iocounts on other vnodes in the parent chain traversal, it releases them.
286  */
int
vnode_issubdir(vnode_t vp, vnode_t dvp, int *is_subdir, vfs_context_t ctx)
{
	vnode_t start_vp, tvp;
	vnode_t vp_with_iocount;
	int error = 0;
	char dotdotbuf[] = "..";
	int error_retry_count = 0; /* retry count for potentially transient
	                            *  errors */

	*is_subdir = FALSE;
	tvp = start_vp = vp;
	/*
	 * Anytime we acquire an iocount in this function, we save the vnode
	 * in this variable and release it before exiting.
	 */
	vp_with_iocount = NULLVP;

	while (1) {
		boolean_t defer;
		vnode_t pvp;
		uint32_t vid = 0;
		struct componentname cn;
		boolean_t is_subdir_locked = FALSE;

		if (tvp == dvp) {
			*is_subdir = TRUE;
			break;
		} else if (tvp == rootvnode) {
			/* *is_subdir = FALSE */
			break;
		}

		NAME_CACHE_LOCK_SHARED();

		/*
		 * Try to resolve the relationship purely from cached
		 * v_parent pointers while holding the name cache lock.
		 */
		defer = cache_check_vnode_issubdir(tvp, dvp, &is_subdir_locked,
		    &tvp);

		if (defer && tvp) {
			/*
			 * Take a holdcount and snapshot the vid so tvp can be
			 * validated with vnode_getwithvid after the lock is
			 * dropped (vnode_get under the name cache lock can
			 * deadlock).
			 */
			vid = vnode_vid(tvp);
			vnode_hold(tvp);
		}

		NAME_CACHE_UNLOCK();

		if (!defer) {
			/* the cache walk fully resolved the relationship */
			*is_subdir = is_subdir_locked;
			break;
		}

		if (!tvp) {
			/*
			 * Likely a forced unmount mid-walk; restart from the
			 * original vp a bounded number of times.
			 */
			if (error_retry_count++ < MAX_ERROR_RETRY) {
				tvp = vp;
				continue;
			}
			error = ENOENT;
			break;
		}

		if (tvp != start_vp) {
			if (vp_with_iocount) {
				vnode_put(vp_with_iocount);
				vp_with_iocount = NULLVP;
			}

			error = vnode_getwithvid(tvp, vid);
			vnode_drop(tvp);
			if (error) {
				/* vnode got recycled; retry from the top */
				if (error_retry_count++ < MAX_ERROR_RETRY) {
					tvp = vp;
					error = 0;
					continue;
				}
				break;
			}
			vp_with_iocount = tvp;
		} else {
			/* start_vp already carries the caller's iocount */
			tvp = vnode_drop(tvp);
		}

		/* ask the filesystem for tvp's parent via a ".." lookup */
		bzero(&cn, sizeof(cn));
		cn.cn_nameiop = LOOKUP;
		cn.cn_flags = ISLASTCN | ISDOTDOT;
		cn.cn_context = ctx;
		cn.cn_pnbuf = &dotdotbuf[0];
		cn.cn_pnlen = sizeof(dotdotbuf);
		cn.cn_nameptr = cn.cn_pnbuf;
		cn.cn_namelen = 2;

		pvp = NULLVP;
		if ((error = VNOP_LOOKUP(tvp, &pvp, &cn, ctx))) {
			break;
		}

		/* record the freshly learned parent (not for hardlinks) */
		if (!(tvp->v_flag & VISHARDLINK) && tvp->v_parent != pvp) {
			(void)vnode_update_identity(tvp, pvp, NULL, 0, 0,
			    VNODE_UPDATE_PARENT);
		}

		if (vp_with_iocount) {
			vnode_put(vp_with_iocount);
		}

		/* VNOP_LOOKUP returned pvp with an iocount; continue from it */
		vp_with_iocount = tvp = pvp;
	}

	if (vp_with_iocount) {
		vnode_put(vp_with_iocount);
	}

	return error;
}
399 
400 /*
401  * This function builds the path in "buff" from the supplied vnode.
402  * The length of the buffer *INCLUDING* the trailing zero byte is
403  * returned in outlen.  NOTE: the length includes the trailing zero
404  * byte and thus the length is one greater than what strlen would
405  * return.  This is important and lots of code elsewhere in the kernel
406  * assumes this behavior.
407  *
408  * This function can call vnop in file system if the parent vnode
409  * does not exist or when called for hardlinks via volfs path.
410  * If BUILDPATH_NO_FS_ENTER is set in flags, it only uses values present
411  * in the name cache and does not enter the file system.
412  *
413  * If BUILDPATH_CHECK_MOVED is set in flags, we return EAGAIN when
414  * we encounter ENOENT during path reconstruction.  ENOENT means that
415  * one of the parents moved while we were building the path.  The
416  * caller can special handle this case by calling build_path again.
417  *
418  * If BUILDPATH_VOLUME_RELATIVE is set in flags, we return path
419  * that is relative to the nearest mount point, i.e. do not
420  * cross over mount points during building the path.
421  *
422  * passed in vp must have a valid io_count reference
423  *
424  * If parent vnode is non-NULL it also must have an io count.  This
425  * allows build_path_with_parent to be safely called for operations
426  * unlink, rmdir and rename that already have io counts on the target
427  * and the directory. In this way build_path_with_parent does not have
428  * to try and obtain an additional io count on the parent.  Taking an
 * io count on the parent can lead to deadlock if a forced unmount
 * occurs at the right moment. For a fuller explanation of how this
 * can occur see the comment for vn_getpath_with_parent.
432  *
433  */
int
build_path_with_parent(vnode_t first_vp, vnode_t parent_vp, char *buff, int buflen,
    int *outlen, size_t *mntpt_outlen, int flags, vfs_context_t ctx)
{
	vnode_t vp, tvp;
	vnode_t vp_with_iocount;
	vnode_t proc_root_dir_vp;
	char *end;
	char *mntpt_end;
	const char *str;
	unsigned int  len;
	int  ret = 0;
	int  fixhardlink;

	if (first_vp == NULLVP) {
		return EINVAL;
	}

	if (buflen <= 1) {
		return ENOSPC;
	}

	/*
	 * Grab the process fd so we can evaluate fd_rdir.
	 */
	if (!(flags & BUILDPATH_NO_PROCROOT)) {
		proc_root_dir_vp = vfs_context_proc(ctx)->p_fd.fd_rdir;
	} else {
		proc_root_dir_vp = NULL;
	}

	vp_with_iocount = NULLVP;
again:
	/*
	 * Restart point: taken whenever a vnode we were working with got
	 * recycled while the name cache lock was dropped.  The path is
	 * rebuilt from scratch, filling the buffer from the end backwards.
	 */
	vp = first_vp;

	end = &buff[buflen - 1];
	*end = '\0';
	mntpt_end = NULL;

	/*
	 * Catch a special corner case here: chroot to /full/path/to/dir, chdir to
	 * it, then open it. Without this check, the path to it will be
	 * /full/path/to/dir instead of "/".
	 */
	if (proc_root_dir_vp == first_vp) {
		*--end = '/';
		goto out;
	}

	/*
	 * holding the NAME_CACHE_LOCK in shared mode is
	 * sufficient to stabilize both the vp->v_parent chain
	 * and the 'vp->v_mount->mnt_vnodecovered' chain
	 *
	 * if we need to drop this lock, we must first grab the v_id
	 * from the vnode we're currently working with... if that
	 * vnode doesn't already have an io_count reference (the vp
	 * passed in comes with one), we must grab a reference
	 * after we drop the NAME_CACHE_LOCK via vnode_getwithvid...
	 * deadlocks may result if you call vnode_get while holding
	 * the NAME_CACHE_LOCK... we lazily release the reference
	 * we pick up the next time we encounter a need to drop
	 * the NAME_CACHE_LOCK or before we return from this routine
	 */
	NAME_CACHE_LOCK_SHARED();

#if CONFIG_FIRMLINKS
	/* a firmlink target directory is reported via its link vnode */
	if (!(flags & BUILDPATH_NO_FIRMLINK) &&
	    (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink && (vp->v_fmlink->v_type == VDIR)) {
		vp = vp->v_fmlink;
	}
#endif

	/*
	 * Check if this is the root of a file system.
	 */
	while (vp && vp->v_flag & VROOT) {
		if (vp->v_mount == NULL) {
			ret = EINVAL;
			goto out_unlock;
		}
		if ((vp->v_mount->mnt_flag & MNT_ROOTFS) || (vp == proc_root_dir_vp)) {
			/*
			 * It's the root of the root file system, so it's
			 * just "/".
			 */
			*--end = '/';

			goto out_unlock;
		} else {
			/*
			 * This is the root of a volume; either report it as
			 * "/" (volume-relative) or cross over to the vnode
			 * covered by this mount and keep walking up.
			 */
#if CONFIG_FIRMLINKS
			if (!(flags & BUILDPATH_NO_FIRMLINK) &&
			    (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink && (vp->v_fmlink->v_type == VDIR)) {
				vp = vp->v_fmlink;
			} else
#endif
			if (flags & BUILDPATH_VOLUME_RELATIVE) {
				*--end = '/';
				goto out_unlock;
			} else {
				vp = vp->v_mount->mnt_vnodecovered;
				/* remember where the nearest mount point starts */
				if (!mntpt_end && vp) {
					mntpt_end = end;
				}
			}
		}
	}

	/* main loop: prepend one path component per iteration */
	while ((vp != NULLVP) && (vp->v_parent != vp)) {
		int  vid;

		/*
		 * For hardlinks the v_name may be stale, so if its OK
		 * to enter a file system, ask the file system for the
		 * name and parent (below).
		 */
		fixhardlink = (vp->v_flag & VISHARDLINK) &&
		    (vp->v_mount->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
		    !(flags & BUILDPATH_NO_FS_ENTER);

		if (!fixhardlink) {
			str = vp->v_name;

			if (str == NULL || *str == '\0') {
				if (vp->v_parent != NULL) {
					ret = EINVAL;
				} else {
					ret = ENOENT;
				}
				goto out_unlock;
			}
			len = (unsigned int)strlen(str);
			/*
			 * Check that there's enough space (including space for the '/')
			 */
			if ((unsigned int)(end - buff) < (len + 1)) {
				ret = ENOSPC;
				goto out_unlock;
			}
			/*
			 * Copy the name backwards.
			 */
			str += len;

			for (; len > 0; len--) {
				*--end = *--str;
			}
			/*
			 * Add a path separator.
			 */
			*--end = '/';
		}

		/*
		 * Walk up the parent chain.
		 */
		if (((vp->v_parent != NULLVP) && !fixhardlink) ||
		    (flags & BUILDPATH_NO_FS_ENTER)) {
			/*
			 * In this if () block we are not allowed to enter the filesystem
			 * to conclusively get the most accurate parent identifier.
			 * As a result, if 'vp' does not identify '/' and it
			 * does not have a valid v_parent, then error out
			 * and disallow further path construction
			 */
			if ((vp->v_parent == NULLVP) && (rootvnode != vp)) {
				/*
				 * Only '/' is allowed to have a NULL parent
				 * pointer. Upper level callers should ideally
				 * re-drive name lookup on receiving a ENOENT.
				 */
				ret = ENOENT;

				/* The code below will exit early if 'tvp = vp' == NULL */
			}
			vp = vp->v_parent;

			/*
			 * if the vnode we have in hand isn't a directory and it
			 * has a v_parent, then we started with the resource fork
			 * so skip up to avoid getting a duplicate copy of the
			 * file name in the path.
			 */
			if (vp && !vnode_isdir(vp) && vp->v_parent) {
				vp = vp->v_parent;
			}
		} else {
			/*
			 * No parent, go get it if supported.
			 */
			struct vnode_attr  va;
			vnode_t  dvp;

			/*
			 * Make sure file system supports obtaining a path from id.
			 */
			if (!(vp->v_mount->mnt_kern_flag & MNTK_PATH_FROM_ID)) {
				ret = ENOENT;
				goto out_unlock;
			}
			/*
			 * Snapshot the vid and take a holdcount so vp can be
			 * revalidated after the name cache lock is dropped.
			 */
			vid = vp->v_id;

			vnode_hold(vp);
			NAME_CACHE_UNLOCK();

			if (vp != first_vp && vp != parent_vp && vp != vp_with_iocount) {
				if (vp_with_iocount) {
					vnode_put(vp_with_iocount);
					vp_with_iocount = NULLVP;
				}
				if (vnode_getwithvid(vp, vid)) {
					/* vp was recycled; rebuild the whole path */
					vnode_drop(vp);
					goto again;
				}
				vp_with_iocount = vp;
			}

			vnode_drop(vp);

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_parentid);

			if (fixhardlink) {
				VATTR_WANTED(&va, va_name);
				va.va_name = zalloc(ZV_NAMEI);
			} else {
				va.va_name = NULL;
			}
			/*
			 * Ask the file system for its parent id and for its name (optional).
			 */
			ret = vnode_getattr(vp, &va, ctx);

			if (fixhardlink) {
				if ((ret == 0) && (VATTR_IS_SUPPORTED(&va, va_name))) {
					str = va.va_name;
					/* refresh the cached name with the authoritative one */
					vnode_update_identity(vp, NULL, str, (unsigned int)strlen(str), 0, VNODE_UPDATE_NAME);
				} else if (vp->v_name) {
					str = vp->v_name;
					ret = 0;
				} else {
					ret = ENOENT;
					goto bad_news;
				}
				len = (unsigned int)strlen(str);

				/*
				 * Check that there's enough space.
				 */
				if ((unsigned int)(end - buff) < (len + 1)) {
					ret = ENOSPC;
				} else {
					/* Copy the name backwards. */
					str += len;

					for (; len > 0; len--) {
						*--end = *--str;
					}
					/*
					 * Add a path separator.
					 */
					*--end = '/';
				}
bad_news:
				zfree(ZV_NAMEI, va.va_name);
			}
			if (ret || !VATTR_IS_SUPPORTED(&va, va_parentid)) {
				ret = ENOENT;
				goto out;
			}
			/*
			 * Ask the file system for the parent vnode.
			 */
			if ((ret = VFS_VGET(vp->v_mount, (ino64_t)va.va_parentid, &dvp, ctx))) {
				goto out;
			}

			if (!fixhardlink && (vp->v_parent != dvp)) {
				vnode_update_identity(vp, dvp, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

			if (vp_with_iocount) {
				vnode_put(vp_with_iocount);
			}
			/* VFS_VGET returned dvp with an iocount */
			vp = dvp;
			vp_with_iocount = vp;

			NAME_CACHE_LOCK_SHARED();

			/*
			 * if the vnode we have in hand isn't a directory and it
			 * has a v_parent, then we started with the resource fork
			 * so skip up to avoid getting a duplicate copy of the
			 * file name in the path.
			 */
			if (vp && !vnode_isdir(vp) && vp->v_parent) {
				vp = vp->v_parent;
			}
		}

		if (vp && (flags & BUILDPATH_CHECKACCESS)) {
			/* same vid/hold dance: drop the lock to call kauth */
			vid = vp->v_id;

			vnode_hold(vp);
			NAME_CACHE_UNLOCK();

			if (vp != first_vp && vp != parent_vp && vp != vp_with_iocount) {
				if (vp_with_iocount) {
					vnode_put(vp_with_iocount);
					vp_with_iocount = NULLVP;
				}
				if (vnode_getwithvid(vp, vid)) {
					/* vp was recycled; rebuild the whole path */
					vnode_drop(vp);
					goto again;
				}
				vp_with_iocount = vp;
			}
			vnode_drop(vp);

			if ((ret = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx))) {
				goto out;       /* no peeking */
			}
			NAME_CACHE_LOCK_SHARED();
		}

		/*
		 * When a mount point is crossed switch the vp.
		 * Continue until we find the root or we find
		 * a vnode that's not the root of a mounted
		 * file system.
		 */
		tvp = vp;

		while (tvp) {
			if (tvp == proc_root_dir_vp) {
				goto out_unlock;        /* encountered the root */
			}

#if CONFIG_FIRMLINKS
			if (!(flags & BUILDPATH_NO_FIRMLINK) &&
			    (tvp->v_flag & VFMLINKTARGET) && tvp->v_fmlink && (tvp->v_fmlink->v_type == VDIR)) {
				tvp = tvp->v_fmlink;
				break;
			}
#endif

			if (!(tvp->v_flag & VROOT) || !tvp->v_mount) {
				break;                  /* not the root of a mounted FS */
			}
			if (flags & BUILDPATH_VOLUME_RELATIVE) {
				/* Do not cross over mount points */
				tvp = NULL;
			} else {
				tvp = tvp->v_mount->mnt_vnodecovered;
				/* remember where the nearest mount point starts */
				if (!mntpt_end && tvp) {
					mntpt_end = end;
				}
			}
		}
		if (tvp == NULLVP) {
			goto out_unlock;
		}
		vp = tvp;
	}
out_unlock:
	NAME_CACHE_UNLOCK();
out:
	if (vp_with_iocount) {
		vnode_put(vp_with_iocount);
	}
	/*
	 * Slide the name down to the beginning of the buffer.
	 */
	memmove(buff, end, &buff[buflen] - end);

	/*
	 * length includes the trailing zero byte
	 */
	*outlen = (int)(&buff[buflen] - end);
	if (mntpt_outlen && mntpt_end) {
		*mntpt_outlen = (size_t)*outlen - (size_t)(&buff[buflen] - mntpt_end);
	}

	/* One of the parents was moved during path reconstruction.
	 * The caller is interested in knowing whether any of the
	 * parents moved via BUILDPATH_CHECK_MOVED, so return EAGAIN.
	 */
	if ((ret == ENOENT) && (flags & BUILDPATH_CHECK_MOVED)) {
		ret = EAGAIN;
	}

	return ret;
}
832 
833 int
build_path(vnode_t first_vp,char * buff,int buflen,int * outlen,int flags,vfs_context_t ctx)834 build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs_context_t ctx)
835 {
836 	return build_path_with_parent(first_vp, NULL, buff, buflen, outlen, NULL, flags, ctx);
837 }
838 
839 /*
840  * return NULLVP if vp's parent doesn't
841  * exist, or we can't get a valid iocount
842  * else return the parent of vp
843  */
844 vnode_t
vnode_getparent(vnode_t vp)845 vnode_getparent(vnode_t vp)
846 {
847 	vnode_t pvp = NULLVP;
848 	int     pvid;
849 
850 	NAME_CACHE_LOCK_SHARED();
851 
852 	pvp = vp->v_parent;
853 
854 	/*
855 	 * v_parent is stable behind the name_cache lock
856 	 * however, the only thing we can really guarantee
857 	 * is that we've grabbed a valid iocount on the
858 	 * parent of 'vp' at the time we took the name_cache lock...
859 	 * once we drop the lock, vp could get re-parented
860 	 */
861 	if (pvp != NULLVP) {
862 		pvid = pvp->v_id;
863 
864 		vnode_hold(pvp);
865 		NAME_CACHE_UNLOCK();
866 
867 		if (vnode_getwithvid(pvp, pvid) != 0) {
868 			vnode_drop(pvp);
869 			pvp = NULL;
870 		} else {
871 			vnode_drop(pvp);
872 		}
873 	} else {
874 		NAME_CACHE_UNLOCK();
875 	}
876 	return pvp;
877 }
878 
879 const char *
vnode_getname(vnode_t vp)880 vnode_getname(vnode_t vp)
881 {
882 	const char *name = NULL;
883 
884 	NAME_CACHE_LOCK_SHARED();
885 
886 	if (vp->v_name) {
887 		name = vfs_addname(vp->v_name, (unsigned int)strlen(vp->v_name), 0, 0);
888 	}
889 	NAME_CACHE_UNLOCK();
890 
891 	return name;
892 }
893 
/*
 * Release a string reference previously obtained from vnode_getname()
 * (or indirectly from vnode_getname_printable()).
 */
void
vnode_putname(const char *name)
{
	vfs_removename(name);
}
899 
900 static const char unknown_vnodename[] = "(unknown vnode name)";
901 
902 const char *
vnode_getname_printable(vnode_t vp)903 vnode_getname_printable(vnode_t vp)
904 {
905 	const char *name = vnode_getname(vp);
906 	if (name != NULL) {
907 		return name;
908 	}
909 
910 	switch (vp->v_type) {
911 	case VCHR:
912 	case VBLK:
913 	{
914 		/*
915 		 * Create an artificial dev name from
916 		 * major and minor device number
917 		 */
918 		char dev_name[64];
919 		(void) snprintf(dev_name, sizeof(dev_name),
920 		    "%c(%u, %u)", VCHR == vp->v_type ? 'c':'b',
921 		    major(vp->v_rdev), minor(vp->v_rdev));
922 		/*
923 		 * Add the newly created dev name to the name
924 		 * cache to allow easier cleanup. Also,
925 		 * vfs_addname allocates memory for the new name
926 		 * and returns it.
927 		 */
928 		NAME_CACHE_LOCK_SHARED();
929 		name = vfs_addname(dev_name, (unsigned int)strlen(dev_name), 0, 0);
930 		NAME_CACHE_UNLOCK();
931 		return name;
932 	}
933 	default:
934 		return unknown_vnodename;
935 	}
936 }
937 
938 void
vnode_putname_printable(const char * name)939 vnode_putname_printable(const char *name)
940 {
941 	if (name == unknown_vnodename) {
942 		return;
943 	}
944 	vnode_putname(name);
945 }
946 
947 
948 /*
949  * if VNODE_UPDATE_PARENT, and we can take
950  * a reference on dvp, then update vp with
951  * it's new parent... if vp already has a parent,
952  * then drop the reference vp held on it
953  *
954  * if VNODE_UPDATE_NAME,
955  * then drop string ref on v_name if it exists, and if name is non-NULL
956  * then pick up a string reference on name and record it in v_name...
957  * optionally pass in the length and hashval of name if known
958  *
959  * if VNODE_UPDATE_CACHE, flush the name cache entries associated with vp
960  */
void
vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, uint32_t name_hashval, int flags)
{
	struct  namecache *ncp;
	vnode_t old_parentvp = NULLVP;
	int isstream = (vp->v_flag & VISNAMEDSTREAM);
	int kusecountbumped = 0;
	kauth_cred_t tcred = NULL;
	const char *vname = NULL;
	const char *tname = NULL;

	/* a negative name length is invalid; refuse to use it below */
	if (name_len < 0) {
		return;
	}

	if (flags & VNODE_UPDATE_PARENT) {
		/*
		 * take a usecount on the proposed new parent; if that fails,
		 * behave as if no new parent was supplied
		 */
		if (dvp && vnode_ref(dvp) != 0) {
			dvp = NULLVP;
		}
		/* Don't count a stream's parent ref during unmounts */
		if (isstream && dvp && (dvp != vp) && (dvp != vp->v_parent) && (dvp->v_type == VREG)) {
			vnode_lock_spin(dvp);
			++dvp->v_kusecount;
			kusecountbumped = 1;
			vnode_unlock(dvp);
		}
	} else {
		dvp = NULLVP;
	}
	if ((flags & VNODE_UPDATE_NAME)) {
		if (name != vp->v_name) {
			if (name && *name) {
				if (name_len == 0) {
					name_len = (int)strlen(name);
				}
				/* take the string ref on the new name before going behind the lock */
				tname = vfs_addname(name, name_len, name_hashval, 0);
			}
		} else {
			/* name is unchanged; nothing to do for the NAME update */
			flags &= ~VNODE_UPDATE_NAME;
		}
	}
	if ((flags & (VNODE_UPDATE_PURGE | VNODE_UPDATE_PARENT | VNODE_UPDATE_CACHE | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGEFIRMLINK))) {
		NAME_CACHE_LOCK();

#if CONFIG_FIRMLINKS
		if (flags & VNODE_UPDATE_PURGEFIRMLINK) {
			vnode_t old_fvp = vp->v_fmlink;
			if (old_fvp) {
				vnode_lock_spin(vp);
				vp->v_flag &= ~VFMLINKTARGET;
				vp->v_fmlink = NULLVP;
				vnode_unlock(vp);
				/* must drop the name cache lock before releasing refs */
				NAME_CACHE_UNLOCK();

				/*
				 * vnode_rele can result in cascading series of
				 * usecount releases. The combination of calling
				 * vnode_recycle and dont_reenter (3rd arg to
				 * vnode_rele_internal) ensures we don't have
				 * that issue.
				 */
				vnode_recycle(old_fvp);
				vnode_rele_internal(old_fvp, O_EVTONLY, 1, 0);

				NAME_CACHE_LOCK();
			}
		}
#endif

		if ((flags & VNODE_UPDATE_PURGE)) {
			/* bump the generation so negative entries under the parent are revalidated */
			if (vp->v_parent) {
				vp->v_parent->v_nc_generation++;
			}

			/* delete every cache entry that names this vnode... */
			while ((ncp = LIST_FIRST(&vp->v_nclinks))) {
				cache_delete(ncp, 1);
			}

			/* ...and every entry for which this vnode is the parent */
			while ((ncp = TAILQ_FIRST(&vp->v_ncchildren))) {
				cache_delete(ncp, 1);
			}

			/*
			 * Use a temp variable to avoid kauth_cred_drop() while NAME_CACHE_LOCK is held
			 */
			tcred = vnode_cred(vp);
			vp->v_cred = NOCRED;
			vp->v_authorized_actions = 0;
			vp->v_cred_timestamp = 0;
		}
		if ((flags & VNODE_UPDATE_NAME)) {
			/* swap in the new name; the old string ref is dropped after unlocking */
			vname = vp->v_name;
			vp->v_name = tname;
		}
		if (flags & VNODE_UPDATE_PARENT) {
			if (dvp != vp && dvp != vp->v_parent) {
				old_parentvp = vp->v_parent;
				vp->v_parent = dvp;
				/* dvp's ref is now owned by vp->v_parent; don't back it out below */
				dvp = NULLVP;

				if (old_parentvp) {
					flags |= VNODE_UPDATE_CACHE;
				}
			}
		}
		if (flags & VNODE_UPDATE_CACHE) {
			while ((ncp = LIST_FIRST(&vp->v_nclinks))) {
				cache_delete(ncp, 1);
			}
		}
		NAME_CACHE_UNLOCK();

		if (vname != NULL) {
			vfs_removename(vname);
		}

		kauth_cred_set(&tcred, NOCRED);
	}
	if (dvp != NULLVP) {
		/* Back-out the ref we took if we lost a race for vp->v_parent. */
		if (kusecountbumped) {
			vnode_lock_spin(dvp);
			if (dvp->v_kusecount > 0) {
				--dvp->v_kusecount;
			}
			vnode_unlock(dvp);
		}
		vnode_rele(dvp);
	}
	if (old_parentvp) {
		struct  uthread *ut;
		vnode_t vreclaims = NULLVP;

		if (isstream) {
			vnode_lock_spin(old_parentvp);
			if ((old_parentvp->v_type != VDIR) && (old_parentvp->v_kusecount > 0)) {
				--old_parentvp->v_kusecount;
			}
			vnode_unlock(old_parentvp);
		}
		ut = current_uthread();

		/*
		 * indicate to vnode_rele that it shouldn't do a
		 * vnode_reclaim at this time... instead it will
		 * chain the vnode to the uu_vreclaims list...
		 * we'll be responsible for calling vnode_reclaim
		 * on each of the vnodes in this list...
		 */
		ut->uu_defer_reclaims = 1;
		ut->uu_vreclaims = NULLVP;

		while ((vp = old_parentvp) != NULLVP) {
			vnode_hold(vp);
			vnode_lock_spin(vp);
			vnode_rele_internal(vp, 0, 0, 1);

			/*
			 * check to see if the vnode is now in the state
			 * that would have triggered a vnode_reclaim in vnode_rele
			 * if it is, we save its parent pointer and then NULL
			 * out the v_parent field... we'll drop the reference
			 * that was held on the next iteration of this loop...
			 * this short circuits a potential deep recursion if we
			 * have a long chain of parents in this state...
			 * we'll sit in this loop until we run into
			 * a parent in this chain that is not in this state
			 *
			 * make our check and the vnode_rele atomic
			 * with respect to the current vnode we're working on
			 * by holding the vnode lock
			 * if vnode_rele deferred the vnode_reclaim and has put
			 * this vnode on the list to be reaped by us, then
			 * it has left this vnode with an iocount == 1
			 */
			if (ut->uu_vreclaims == vp) {
				/*
				 * This vnode is on the head of the uu_vreclaims chain
				 * which means vnode_rele wanted to do a vnode_reclaim
				 * on this vnode. Pull the parent pointer now so that when we do the
				 * vnode_reclaim for each of the vnodes in the uu_vreclaims
				 * list, we won't recurse back through here
				 *
				 * need to do a convert here in case vnode_rele_internal
				 * returns with the lock held in the spin mode... it
				 * can drop and retake the lock under certain circumstances
				 */
				vnode_lock_convert(vp);

				NAME_CACHE_LOCK();
				old_parentvp = vp->v_parent;
				vp->v_parent = NULLVP;
				NAME_CACHE_UNLOCK();
			} else {
				/*
				 * we're done... we ran into a vnode that isn't
				 * being terminated
				 */
				old_parentvp = NULLVP;
			}
			vnode_drop_and_unlock(vp);
		}
		vreclaims = ut->uu_vreclaims;
		ut->uu_vreclaims = NULLVP;
		ut->uu_defer_reclaims = 0;

		while ((vp = vreclaims) != NULLVP) {
			vreclaims = vp->v_defer_reclaimlist;

			/*
			 * vnode_put will drive the vnode_reclaim if
			 * we are still the only reference on this vnode
			 */
			vnode_put(vp);
		}
	}
}
1178 
1179 #if CONFIG_FIRMLINKS
/*
 * Establish (or, when target_vp is NULL, clear) the firmlink relationship
 * between 'vp' (the firmlink "source") and 'target_vp' (the "target").
 * Source and target each take a usecount and kusecount on the other;
 * any previously linked target is detached and its references dropped.
 * Returns 0 on success or an errno from vnode_ref_ext.
 */
errno_t
vnode_setasfirmlink(vnode_t vp, vnode_t target_vp)
{
	int error = 0;
	vnode_t old_target_vp = NULLVP;
	vnode_t old_target_vp_v_fmlink = NULLVP;
	kauth_cred_t target_vp_cred = NULL;
	kauth_cred_t old_target_vp_cred = NULL;

	if (!vp) {
		return EINVAL;
	}

	if (target_vp) {
		if (vp->v_fmlink == target_vp) { /* Will be checked again under the name cache lock */
			return 0;
		}

		/*
		 * Firmlink source and target will take both a usecount
		 * and kusecount on each other.
		 */
		if ((error = vnode_ref_ext(target_vp, O_EVTONLY, VNODE_REF_FORCE))) {
			return error;
		}

		if ((error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE))) {
			vnode_rele_ext(target_vp, O_EVTONLY, 1);
			return error;
		}
	}

	NAME_CACHE_LOCK();

	old_target_vp = vp->v_fmlink;
	if (target_vp && (target_vp == old_target_vp)) {
		/* lost a race; the link is already in place */
		NAME_CACHE_UNLOCK();
		return 0;
	}
	vp->v_fmlink = target_vp;

	/* 'vp' is now (or remains) a source, never a target */
	vnode_lock_spin(vp);
	vp->v_flag &= ~VFMLINKTARGET;
	vnode_unlock(vp);

	if (target_vp) {
		/* back-link the target to the source and mark it as a target */
		target_vp->v_fmlink = vp;
		vnode_lock_spin(target_vp);
		target_vp->v_flag |= VFMLINKTARGET;
		vnode_unlock(target_vp);
		cache_purge_locked(vp, &target_vp_cred);
	}

	if (old_target_vp) {
		/* detach the previous target; its refs are dropped after unlocking */
		old_target_vp_v_fmlink = old_target_vp->v_fmlink;
		old_target_vp->v_fmlink = NULLVP;
		vnode_lock_spin(old_target_vp);
		old_target_vp->v_flag &= ~VFMLINKTARGET;
		vnode_unlock(old_target_vp);
		cache_purge_locked(vp, &old_target_vp_cred);
	}

	NAME_CACHE_UNLOCK();

	/* cred drops are deferred until after NAME_CACHE_UNLOCK */
	kauth_cred_set(&target_vp_cred, NOCRED);

	if (old_target_vp) {
		kauth_cred_set(&old_target_vp_cred, NOCRED);

		vnode_rele_ext(old_target_vp, O_EVTONLY, 1);
		if (old_target_vp_v_fmlink) {
			vnode_rele_ext(old_target_vp_v_fmlink, O_EVTONLY, 1);
		}
	}

	return 0;
}
1257 
/*
 * Return, in *target_vp with an iocount held, the firmlink target of 'vp'.
 * Returns ENODEV when 'vp' is not a firmlink source (or the iocount could
 * not be taken), and ENOENT when the target is being terminated.
 */
errno_t
vnode_getfirmlink(vnode_t vp, vnode_t *target_vp)
{
	int error;

	/* unlocked pre-check; re-validated under the name cache lock below */
	if (!vp->v_fmlink) {
		return ENODEV;
	}

	NAME_CACHE_LOCK_SHARED();
	if (vp->v_fmlink && !(vp->v_flag & VFMLINKTARGET) &&
	    (vnode_get(vp->v_fmlink) == 0)) {
		vnode_t tvp = vp->v_fmlink;

		vnode_lock_spin(tvp);
		if (tvp->v_lflag & (VL_TERMINATE | VL_DEAD)) {
			/* target is going away; give back the iocount */
			vnode_unlock(tvp);
			NAME_CACHE_UNLOCK();
			vnode_put(tvp);
			return ENOENT;
		}
		if (!(tvp->v_flag & VFMLINKTARGET)) {
			panic("firmlink target for vnode %p does not have flag set", vp);
		}
		vnode_unlock(tvp);
		*target_vp = tvp;
		error = 0;
	} else {
		*target_vp = NULLVP;
		error = ENODEV;
	}
	NAME_CACHE_UNLOCK();
	return error;
}
1292 
1293 #else /* CONFIG_FIRMLINKS */
1294 
/* Firmlinks compiled out: report the operation as unsupported. */
errno_t
vnode_setasfirmlink(__unused vnode_t vp, __unused vnode_t src_vp)
{
	return ENOTSUP;
}
1300 
/* Firmlinks compiled out: report the operation as unsupported. */
errno_t
vnode_getfirmlink(__unused vnode_t vp, __unused vnode_t *target_vp)
{
	return ENOTSUP;
}
1306 
1307 #endif
1308 
1309 /*
1310  * Mark a vnode as having multiple hard links.  HFS makes use of this
1311  * because it keeps track of each link separately, and wants to know
1312  * which link was actually used.
1313  *
1314  * This will cause the name cache to force a VNOP_LOOKUP on the vnode
1315  * so that HFS can post-process the lookup.  Also, volfs will call
1316  * VNOP_GETATTR2 to determine the parent, instead of using v_parent.
1317  */
void
vnode_setmultipath(vnode_t vp)
{
	vnode_lock_spin(vp);

	/*
	 * In theory, we're changing the vnode's identity as far as the
	 * name cache is concerned, so we ought to grab the name cache lock
	 * here.  However, there is already a race, and grabbing the name
	 * cache lock only makes the race window slightly smaller.
	 *
	 * The race happens because the vnode already exists in the name
	 * cache, and could be found by one thread before another thread
	 * can set the hard link flag.
	 */

	/* v_flag is updated under the vnode lock */
	vp->v_flag |= VISHARDLINK;

	vnode_unlock(vp);
}
1338 
1339 
1340 
1341 /*
1342  * backwards compatibility
1343  */
/*
 * Backwards-compatibility wrapper: invalidate all cached
 * authorization rights (and the cached credential) on 'vp'.
 */
void
vnode_uncache_credentials(vnode_t vp)
{
	vnode_uncache_authorized_action(vp, KAUTH_INVALIDATE_CACHED_RIGHTS);
}
1349 
1350 
1351 /*
1352  * use the exclusive form of NAME_CACHE_LOCK to protect the update of the
1353  * following fields in the vnode: v_cred_timestamp, v_cred, v_authorized_actions
1354  * we use this lock so that we can look at the v_cred and v_authorized_actions
1355  * atomically while behind the NAME_CACHE_LOCK in shared mode in 'cache_lookup_path',
1356  * which is the super-hot path... if we are updating the authorized actions for this
 * vnode, we are already in the super-slow and far less frequented path so it's not
1358  * that bad that we take the lock exclusive for this case... of course we strive
1359  * to hold it for the minimum amount of time possible
1360  */
1361 
/*
 * Remove 'action' from the set of authorized actions cached on 'vp'.
 * When action is KAUTH_INVALIDATE_CACHED_RIGHTS, the cached credential
 * is dropped as well.
 */
void
vnode_uncache_authorized_action(vnode_t vp, kauth_action_t action)
{
	kauth_cred_t tcred = NOCRED;

	NAME_CACHE_LOCK();

	/* strip the given rights from the cached set */
	vp->v_authorized_actions &= ~action;

	if (action == KAUTH_INVALIDATE_CACHED_RIGHTS &&
	    IS_VALID_CRED(vp->v_cred)) {
		/*
		 * Use a temp variable to avoid kauth_cred_unref() while NAME_CACHE_LOCK is held
		 */
		tcred = vnode_cred(vp);
		vp->v_cred = NOCRED;
	}
	NAME_CACHE_UNLOCK();

	kauth_cred_set(&tcred, NOCRED);
}
1383 
1384 
1385 /* disable vnode_cache_is_authorized() by setting vnode_cache_defeat */
1386 static TUNABLE(int, bootarg_vnode_cache_defeat, "-vnode_cache_defeat", 0);
1387 
/*
 * Check whether 'action' is already authorized on 'vp' for the credential
 * in 'ctx' via the vnode's cached rights.  Returns TRUE only when the
 * cached cred matches and every bit of 'action' is in v_authorized_actions.
 */
boolean_t
vnode_cache_is_authorized(vnode_t vp, vfs_context_t ctx, kauth_action_t action)
{
	kauth_cred_t    ucred;
	boolean_t       retval = FALSE;

	/* Boot argument to defeat rights caching */
	if (bootarg_vnode_cache_defeat) {
		return FALSE;
	}

	if ((vp->v_mount->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL))) {
		/*
		 * a TTL is enabled on the rights cache... handle it here
		 * a TTL of 0 indicates that no rights should be cached
		 */
		if (vp->v_mount->mnt_authcache_ttl) {
			if (!(vp->v_mount->mnt_kern_flag & MNTK_AUTH_CACHE_TTL)) {
				/*
				 * For filesystems marked only MNTK_AUTH_OPAQUE (generally network ones),
				 * we will only allow a SEARCH right on a directory to be cached...
				 * that cached right always has a default TTL associated with it
				 */
				if (action != KAUTH_VNODE_SEARCH || vp->v_type != VDIR) {
					vp = NULLVP;
				}
			}
			if (vp != NULLVP && vnode_cache_is_stale(vp) == TRUE) {
				/* TTL expired... toss the cached rights */
				vnode_uncache_authorized_action(vp, vp->v_authorized_actions);
				vp = NULLVP;
			}
		} else {
			vp = NULLVP;
		}
	}
	if (vp != NULLVP) {
		ucred = vfs_context_ucred(ctx);

		NAME_CACHE_LOCK_SHARED();

		/* cred and authorized_actions are stable behind the name cache lock */
		if (vnode_cred(vp) == ucred && (vp->v_authorized_actions & action) == action) {
			retval = TRUE;
		}

		NAME_CACHE_UNLOCK();
	}
	return retval;
}
1436 
1437 
/*
 * Record that 'action' has been authorized on 'vp' for the credential in
 * 'ctx', subject to the mount's auth-cache TTL policy.  A cred switch
 * resets the cached action set before the new rights are recorded.
 */
void
vnode_cache_authorized_action(vnode_t vp, vfs_context_t ctx, kauth_action_t action)
{
	kauth_cred_t tcred = NOCRED;
	kauth_cred_t ucred;
	struct timeval tv;
	boolean_t ttl_active = FALSE;

	ucred = vfs_context_ucred(ctx);

	if (!IS_VALID_CRED(ucred) || action == 0) {
		return;
	}

	if ((vp->v_mount->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL))) {
		/*
		 * a TTL is enabled on the rights cache... handle it here
		 * a TTL of 0 indicates that no rights should be cached
		 */
		if (vp->v_mount->mnt_authcache_ttl == 0) {
			return;
		}

		if (!(vp->v_mount->mnt_kern_flag & MNTK_AUTH_CACHE_TTL)) {
			/*
			 * only cache SEARCH action for filesystems marked
			 * MNTK_AUTH_OPAQUE on VDIRs...
			 * the lookup_path code will time these out
			 */
			if ((action & ~KAUTH_VNODE_SEARCH) || vp->v_type != VDIR) {
				return;
			}
		}
		ttl_active = TRUE;

		microuptime(&tv);
	}
	NAME_CACHE_LOCK();

	if (vnode_cred(vp) != ucred) {
		/*
		 * Use a temp variable to avoid kauth_cred_drop() while NAME_CACHE_LOCK is held
		 */
		tcred = vnode_cred(vp);
		vp->v_cred = NOCRED;
		kauth_cred_set(&vp->v_cred, ucred);
		vp->v_authorized_actions = 0;
	}
	if (ttl_active == TRUE && vp->v_authorized_actions == 0) {
		/*
		 * only reset the timestamp on the
		 * first authorization cached after the previous
		 * timer has expired or we're switching creds...
		 * 'vnode_cache_is_authorized' will clear the
		 * authorized actions if the TTL is active and
		 * it has expired
		 */
		vp->v_cred_timestamp = (int)tv.tv_sec;
	}
	vp->v_authorized_actions |= action;

	NAME_CACHE_UNLOCK();

	kauth_cred_set(&tcred, NOCRED);
}
1503 
1504 
1505 boolean_t
vnode_cache_is_stale(vnode_t vp)1506 vnode_cache_is_stale(vnode_t vp)
1507 {
1508 	struct timeval  tv;
1509 	boolean_t       retval;
1510 
1511 	microuptime(&tv);
1512 
1513 	if ((tv.tv_sec - vp->v_cred_timestamp) > vp->v_mount->mnt_authcache_ttl) {
1514 		retval = TRUE;
1515 	} else {
1516 		retval = FALSE;
1517 	}
1518 
1519 	return retval;
1520 }
1521 
1522 
1523 
1524 /*
1525  * Returns:	0			Success
1526  *		ERECYCLE		vnode was recycled from underneath us.  Force lookup to be re-driven from namei.
1527  *                                              This errno value should not be seen by anyone outside of the kernel.
1528  */
/*
 * Fast-path pathname traversal using only the name cache, performed
 * entirely behind NAME_CACHE_LOCK_SHARED.  Consumes as many components
 * of the path in 'cnp'/'ndp' as can be resolved from the cache, breaking
 * out to the slow path (VNOP_LOOKUP) whenever authorization, TTL, mount
 * or cache state requires it.  On return ndp->ni_dvp holds the last
 * directory reached (with an iocount) and ndp->ni_vp the resolved child,
 * if any.  '*dp_authorized' tells the caller whether a vnode_authorize
 * of the final directory can be skipped.
 */
int
cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp,
    vfs_context_t ctx, int *dp_authorized, vnode_t last_dp)
{
	char            *cp;            /* pointer into pathname argument */
	int             vid;
	int             vvid = 0;       /* protected by vp != NULLVP */
	vnode_t         vp = NULLVP;
	vnode_t         tdp = NULLVP;
	kauth_cred_t    ucred;
	boolean_t       ttl_enabled = FALSE;
	struct timeval  tv;
	mount_t         mp;
	unsigned int    hash;
	int             error = 0;
	boolean_t       dotdotchecked = FALSE;

#if CONFIG_TRIGGERS
	vnode_t         trigger_vp;
#endif /* CONFIG_TRIGGERS */

	ucred = vfs_context_ucred(ctx);
	ndp->ni_flag &= ~(NAMEI_TRAILINGSLASH);

	NAME_CACHE_LOCK_SHARED();

	if (dp->v_mount && (dp->v_mount->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL))) {
		ttl_enabled = TRUE;
		microuptime(&tv);
	}
	for (;;) {
		/*
		 * Search a directory.
		 *
		 * The cn_hash value is for use by cache_lookup
		 * The last component of the filename is left accessible via
		 * cnp->cn_nameptr for callers that need the name.
		 */
		hash = 0;
		cp = cnp->cn_nameptr;

		while (*cp && (*cp != '/')) {
			hash = crc32tab[((hash >> 24) ^ (unsigned char)*cp++)] ^ hash << 8;
		}
		/*
		 * the crc generator can legitimately generate
		 * a 0... however, 0 for us means that we
		 * haven't computed a hash, so use 1 instead
		 */
		if (hash == 0) {
			hash = 1;
		}
		cnp->cn_hash = hash;
		cnp->cn_namelen = (int)(cp - cnp->cn_nameptr);

		ndp->ni_pathlen -= cnp->cn_namelen;
		ndp->ni_next = cp;

		/*
		 * Replace multiple slashes by a single slash and trailing slashes
		 * by a null.  This must be done before VNOP_LOOKUP() because some
		 * fs's don't know about trailing slashes.  Remember if there were
		 * trailing slashes to handle symlinks, existing non-directories
		 * and non-existing files that won't be directories specially later.
		 */
		while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
			cp++;
			ndp->ni_pathlen--;

			if (*cp == '\0') {
				ndp->ni_flag |= NAMEI_TRAILINGSLASH;
				*ndp->ni_next = '\0';
			}
		}
		ndp->ni_next = cp;

		cnp->cn_flags &= ~(MAKEENTRY | ISLASTCN | ISDOTDOT);

		if (*cp == '\0') {
			cnp->cn_flags |= ISLASTCN;
		}

		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') {
			cnp->cn_flags |= ISDOTDOT;
#if CONFIG_FIRMLINKS
			/*
			 * If this is a firmlink target then dp has to be switched to the
			 * firmlink "source" before exiting this loop.
			 *
			 * For a firmlink "target", the policy is to pick the parent of the
			 * firmlink "source" as the parent. This means that you can never
			 * get to the "real" parent of firmlink target via a dotdot lookup.
			 */
			if (dp->v_fmlink && (dp->v_flag & VFMLINKTARGET) && (dp->v_fmlink->v_type == VDIR)) {
				dp = dp->v_fmlink;
			}
#endif
		}

		*dp_authorized = 0;
#if NAMEDRSRCFORK
		/*
		 * Process a request for a file's resource fork.
		 *
		 * Consume the _PATH_RSRCFORKSPEC suffix and tag the path.
		 */
		if ((ndp->ni_pathlen == sizeof(_PATH_RSRCFORKSPEC)) &&
		    (cp[1] == '.' && cp[2] == '.') &&
		    bcmp(cp, _PATH_RSRCFORKSPEC, sizeof(_PATH_RSRCFORKSPEC)) == 0) {
			/* Skip volfs file systems that don't support native streams. */
			if ((dp->v_mount != NULL) &&
			    (dp->v_mount->mnt_flag & MNT_DOVOLFS) &&
			    (dp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) {
				goto skiprsrcfork;
			}
			cnp->cn_flags |= CN_WANTSRSRCFORK;
			cnp->cn_flags |= ISLASTCN;
			ndp->ni_next[0] = '\0';
			ndp->ni_pathlen = 1;
		}
skiprsrcfork:
#endif

#if CONFIG_MACF

		/*
		 * Name cache provides authorization caching (see below)
		 * that will short circuit MAC checks in lookup().
		 * We must perform MAC check here.  On denial
		 * dp_authorized will remain 0 and second check will
		 * be performed in lookup().
		 */
		if (!(cnp->cn_flags & DONOTAUTH)) {
			error = mac_vnode_check_lookup(ctx, dp, cnp);
			if (error) {
				NAME_CACHE_UNLOCK();
				goto errorout;
			}
		}
#endif /* MAC */
		/* cached rights for this directory have timed out... leave the fast path */
		if (ttl_enabled &&
		    (dp->v_mount->mnt_authcache_ttl == 0 ||
		    ((tv.tv_sec - dp->v_cred_timestamp) > dp->v_mount->mnt_authcache_ttl))) {
			break;
		}

		/*
		 * NAME_CACHE_LOCK holds these fields stable
		 *
		 * We can't cache KAUTH_VNODE_SEARCHBYANYONE for root correctly
		 * so we make an ugly check for root here. root is always
		 * allowed and breaking out of here only to find out that is
		 * authorized by virtue of being root is very very expensive.
		 * However, the check for not root is valid only for filesystems
		 * which use local authorization.
		 *
		 * XXX: Remove the check for root when we can reliably set
		 * KAUTH_VNODE_SEARCHBYANYONE as root.
		 */
		if ((vnode_cred(dp) != ucred || !(dp->v_authorized_actions & KAUTH_VNODE_SEARCH)) &&
		    !(dp->v_authorized_actions & KAUTH_VNODE_SEARCHBYANYONE) &&
		    (ttl_enabled || !vfs_context_issuser(ctx))) {
			break;
		}

		/*
		 * indicate that we're allowed to traverse this directory...
		 * even if we fail the cache lookup or decide to bail for
		 * some other reason, this information is valid and is used
		 * to avoid doing a vnode_authorize before the call to VNOP_LOOKUP
		 */
		*dp_authorized = 1;

		if ((cnp->cn_flags & (ISLASTCN | ISDOTDOT))) {
			if (cnp->cn_nameiop != LOOKUP) {
				break;
			}
			if (cnp->cn_flags & LOCKPARENT) {
				break;
			}
			if (cnp->cn_flags & NOCACHE) {
				break;
			}

			if (cnp->cn_flags & ISDOTDOT) {
				/*
				 * Force directory hardlinks to go to
				 * file system for ".." requests.
				 */
				if ((dp->v_flag & VISHARDLINK)) {
					break;
				}
				/*
				 * Quit here only if we can't use
				 * the parent directory pointer or
				 * don't have one.  Otherwise, we'll
				 * use it below.
				 */
				if ((dp->v_flag & VROOT) ||
				    dp == ndp->ni_rootdir ||
				    dp->v_parent == NULLVP) {
					break;
				}
			}
		}

		if ((cnp->cn_flags & CN_SKIPNAMECACHE)) {
			/*
			 * Force lookup to go to the filesystem with
			 * all cnp fields set up.
			 */
			break;
		}

		/*
		 * "." and ".." aren't supposed to be cached, so check
		 * for them before checking the cache.
		 */
		if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
			vp = dp;
		} else if ((cnp->cn_flags & ISDOTDOT)) {
			/*
			 * If this is a chrooted process, we need to check if
			 * the process is trying to break out of its chrooted
			 * jail. We do that by trying to determine if dp is
			 * a subdirectory of ndp->ni_rootdir. If we aren't
			 * able to determine that by the v_parent pointers, we
			 * will leave the fast path.
			 *
			 * Since this function may see dotdot components
			 * many times and it has the name cache lock held for
			 * the entire duration, we optimise this by doing this
			 * check only once per cache_lookup_path call.
			 * If dotdotchecked is set, it means we've done this
			 * check once already and don't need to do it again.
			 */
			if (!dotdotchecked && (ndp->ni_rootdir != rootvnode)) {
				vnode_t tvp = dp;
				boolean_t defer = FALSE;
				boolean_t is_subdir = FALSE;

				defer = cache_check_vnode_issubdir(tvp,
				    ndp->ni_rootdir, &is_subdir, &tvp);

				if (defer) {
					/* defer to Filesystem */
					break;
				} else if (!is_subdir) {
					/*
					 * This process is trying to break  out
					 * of its chrooted jail, so all its
					 * dotdot accesses will be translated to
					 * its root directory.
					 */
					vp = ndp->ni_rootdir;
				} else {
					/*
					 * All good, let this dotdot access
					 * proceed normally
					 */
					vp = dp->v_parent;
				}
				dotdotchecked = TRUE;
			} else {
				vp = dp->v_parent;
			}
		} else {
			if ((vp = cache_lookup_locked(dp, cnp)) == NULLVP) {
				break;
			}

			if ((vp->v_flag & VISHARDLINK)) {
				/*
				 * The file system wants a VNOP_LOOKUP on this vnode
				 */
				vp = NULL;
				break;
			}

#if CONFIG_FIRMLINKS
			if (vp->v_fmlink && !(vp->v_flag & VFMLINKTARGET)) {
				if (cnp->cn_flags & CN_FIRMLINK_NOFOLLOW ||
				    ((vp->v_type != VDIR) && (vp->v_type != VLNK))) {
					/* Leave it to the filesystem */
					vp = NULLVP;
					break;
				}

				/*
				 * Always switch to the target unless it is a VLNK
				 * and it is the last component and we have NOFOLLOW
				 * semantics
				 */
				if (vp->v_type == VDIR) {
					vp = vp->v_fmlink;
				} else if ((cnp->cn_flags & FOLLOW) ||
				    (ndp->ni_flag & NAMEI_TRAILINGSLASH) || *ndp->ni_next == '/') {
					if (ndp->ni_loopcnt >= MAXSYMLINKS - 1) {
						vp = NULLVP;
						break;
					}
					ndp->ni_loopcnt++;
					vp = vp->v_fmlink;
				}
			}
#endif
		}
		if ((cnp->cn_flags & ISLASTCN)) {
			break;
		}

		if (vp->v_type != VDIR) {
			if (vp->v_type != VLNK) {
				vp = NULL;
			}
			break;
		}

		if ((mp = vp->v_mountedhere) && ((cnp->cn_flags & NOCROSSMOUNT) == 0)) {
			vnode_t tmp_vp = mp->mnt_realrootvp;
			if (tmp_vp == NULLVP || mp->mnt_generation != mount_generation ||
			    mp->mnt_realrootvp_vid != tmp_vp->v_id) {
				break;
			}
			vp = tmp_vp;
		}

#if CONFIG_TRIGGERS
		/*
		 * After traversing all mountpoints stacked here, if we have a
		 * trigger in hand, resolve it.  Note that we don't need to
		 * leave the fast path if the mount has already happened.
		 */
		if (vp->v_resolve) {
			break;
		}
#endif /* CONFIG_TRIGGERS */


		/* descend into the child and advance to the next component */
		dp = vp;
		vp = NULLVP;

		cnp->cn_nameptr = ndp->ni_next + 1;
		ndp->ni_pathlen--;
		while (*cnp->cn_nameptr == '/') {
			cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
	}
	/* capture the vids while still behind the lock so we can validate after dropping it */
	if (vp != NULLVP) {
		vvid = vp->v_id;
		vnode_hold(vp);
	}
	vid = dp->v_id;

	vnode_hold(dp);
	NAME_CACHE_UNLOCK();

	tdp = NULLVP;
	if ((vp != NULLVP) && (vp->v_type != VLNK) &&
	    ((cnp->cn_flags & (ISLASTCN | LOCKPARENT | WANTPARENT | SAVESTART)) == ISLASTCN)) {
		/*
		 * if we've got a child and it's the last component, and
		 * the lookup doesn't need to return the parent then we
		 * can skip grabbing an iocount on the parent, since all
		 * we're going to do with it is a vnode_put just before
		 * we return from 'lookup'.  If it's a symbolic link,
		 * we need the parent in case the link happens to be
		 * a relative pathname.
		 */
		tdp = dp;
		dp = NULLVP;
	} else {
need_dp:
		/*
		 * return the last directory we looked at
		 * with an io reference held. If it was the one passed
		 * in as a result of the last iteration of VNOP_LOOKUP,
		 * it should already hold an io ref. No need to increase ref.
		 */
		if (last_dp != dp) {
			if (dp == ndp->ni_usedvp) {
				/*
				 * if this vnode matches the one passed in via USEDVP
				 * than this context already holds an io_count... just
				 * use vnode_get to get an extra ref for lookup to play
				 * with... can't use the getwithvid variant here because
				 * it will block behind a vnode_drain which would result
				 * in a deadlock (since we already own an io_count that the
				 * vnode_drain is waiting on)... vnode_get grabs the io_count
				 * immediately w/o waiting... it always succeeds
				 */
				vnode_get(dp);
			} else if ((error = vnode_getwithvid_drainok(dp, vid))) {
				/*
				 * failure indicates the vnode
				 * changed identity or is being
				 * TERMINATED... in either case
				 * punt this lookup.
				 *
				 * don't necessarily return ENOENT, though, because
				 * we really want to go back to disk and make sure it's
				 * there or not if someone else is changing this
				 * vnode. That being said, the one case where we do want
				 * to return ENOENT is when the vnode's mount point is
				 * in the process of unmounting and we might cause a deadlock
				 * in our attempt to take an iocount. An ENODEV error return
				 * from vnode_get* is an indication of this, but we change
				 * that to ENOENT for upper layers.
				 */
				if (error == ENODEV) {
					error = ENOENT;
				} else {
					error = ERECYCLE;
				}
				vnode_drop(dp);
				goto errorout;
			}
		}
		vnode_drop(dp);
	}
	if (vp != NULLVP) {
		if ((vnode_getwithvid_drainok(vp, vvid))) {
			vnode_drop(vp);
			vp = NULLVP;

			/*
			 * can't get reference on the vp we'd like
			 * to return... if we didn't grab a reference
			 * on the directory (due to fast path bypass),
			 * then we need to do it now... we can't return
			 * with both ni_dvp and ni_vp NULL, and no
			 * error condition
			 */
			if (dp == NULLVP) {
				dp = tdp;
				tdp = NULLVP;
				goto need_dp;
			}
		}
		if (vp != NULLVP) {
			vnode_drop(vp);
		}
	}

	if (tdp) {
		vnode_drop(tdp);
		tdp = NULLVP;
	}

	ndp->ni_dvp = dp;
	ndp->ni_vp  = vp;

#if CONFIG_TRIGGERS
	trigger_vp = vp ? vp : dp;
	if ((error == 0) && (trigger_vp != NULLVP) && vnode_isdir(trigger_vp)) {
		error = vnode_trigger_resolve(trigger_vp, ndp, ctx);
		if (error) {
			if (vp) {
				vnode_put(vp);
			}
			if (dp) {
				vnode_put(dp);
			}
			goto errorout;
		}
	}
#endif /* CONFIG_TRIGGERS */

errorout:
	/*
	 * If we came into cache_lookup_path after an iteration of the lookup loop that
	 * resulted in a call to VNOP_LOOKUP, then VNOP_LOOKUP returned a vnode with a io ref
	 * on it.  It is now the job of cache_lookup_path to drop the ref on this vnode
	 * when it is no longer needed.  If we get to this point, and last_dp is not NULL
	 * and it is ALSO not the dvp we want to return to caller of this function, it MUST be
	 * the case that we got to a subsequent path component and this previous vnode is
	 * no longer needed.  We can then drop the io ref on it.
	 */
	if ((last_dp != NULLVP) && (last_dp != ndp->ni_dvp)) {
		vnode_put(last_dp);
	}

	//initialized to 0, should be the same if no error cases occurred.
	return error;
}
2015 
2016 
2017 static vnode_t
cache_lookup_locked(vnode_t dvp,struct componentname * cnp)2018 cache_lookup_locked(vnode_t dvp, struct componentname *cnp)
2019 {
2020 	struct namecache *ncp;
2021 	struct nchashhead *ncpp;
2022 	long namelen = cnp->cn_namelen;
2023 	unsigned int hashval = cnp->cn_hash;
2024 
2025 	if (nc_disabled) {
2026 		return NULL;
2027 	}
2028 
2029 	ncpp = NCHHASH(dvp, cnp->cn_hash);
2030 	LIST_FOREACH(ncp, ncpp, nc_hash) {
2031 		if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) {
2032 			if (strncmp(ncp->nc_name, cnp->cn_nameptr, namelen) == 0 && ncp->nc_name[namelen] == 0) {
2033 				break;
2034 			}
2035 		}
2036 	}
2037 	if (ncp == 0) {
2038 		/*
2039 		 * We failed to find an entry
2040 		 */
2041 		NCHSTAT(ncs_miss);
2042 		return NULL;
2043 	}
2044 	NCHSTAT(ncs_goodhits);
2045 
2046 	return ncp->nc_vp;
2047 }
2048 
2049 
2050 unsigned int hash_string(const char *cp, int len);
2051 //
2052 // Have to take a len argument because we may only need to
2053 // hash part of a componentname.
2054 //
2055 unsigned int
hash_string(const char * cp,int len)2056 hash_string(const char *cp, int len)
2057 {
2058 	unsigned hash = 0;
2059 
2060 	if (len) {
2061 		while (len--) {
2062 			hash = crc32tab[((hash >> 24) ^ (unsigned char)*cp++)] ^ hash << 8;
2063 		}
2064 	} else {
2065 		while (*cp != '\0') {
2066 			hash = crc32tab[((hash >> 24) ^ (unsigned char)*cp++)] ^ hash << 8;
2067 		}
2068 	}
2069 	/*
2070 	 * the crc generator can legitimately generate
2071 	 * a 0... however, 0 for us means that we
2072 	 * haven't computed a hash, so use 1 instead
2073 	 */
2074 	if (hash == 0) {
2075 		hash = 1;
2076 	}
2077 	return hash;
2078 }
2079 
2080 
2081 /*
2082  * Lookup an entry in the cache
2083  *
2084  * We don't do this if the segment name is long, simply so the cache
2085  * can avoid holding long names (which would either waste space, or
2086  * add greatly to the complexity).
2087  *
2088  * Lookup is called with dvp pointing to the directory to search,
2089  * cnp pointing to the name of the entry being sought. If the lookup
2090  * succeeds, the vnode is returned in *vpp, and a status of -1 is
2091  * returned. If the lookup determines that the name does not exist
2092  * (negative cacheing), a status of ENOENT is returned. If the lookup
2093  * fails, a status of zero is returned.
2094  */
2095 
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	long namelen = cnp->cn_namelen;
	unsigned int hashval;
	boolean_t       have_exclusive = FALSE;
	uint32_t vid;
	vnode_t  vp;

	/* compute (and memoize in the componentname) the name hash if needed */
	if (cnp->cn_hash == 0) {
		cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen);
	}
	hashval = cnp->cn_hash;

	if (nc_disabled) {
		return 0;
	}

	/*
	 * start with the shared lock... we only need the exclusive
	 * lock if we end up deleting an entry, in which case we
	 * drop, re-take exclusively and re-run the lookup ('relook')
	 * since the entry may have changed while unlocked
	 */
	NAME_CACHE_LOCK_SHARED();

relook:
	ncpp = NCHHASH(dvp, cnp->cn_hash);
	LIST_FOREACH(ncp, ncpp, nc_hash) {
		if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) {
			if (strncmp(ncp->nc_name, cnp->cn_nameptr, namelen) == 0 && ncp->nc_name[namelen] == 0) {
				break;
			}
		}
	}
	/* We failed to find an entry */
	if (ncp == 0) {
		NCHSTAT(ncs_miss);
		NAME_CACHE_UNLOCK();
		return 0;
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		if (have_exclusive == TRUE) {
			NCHSTAT(ncs_badhits);
			cache_delete(ncp, 1);
			NAME_CACHE_UNLOCK();
			return 0;
		}
		/* deletion requires the exclusive lock... upgrade and retry */
		NAME_CACHE_UNLOCK();
		NAME_CACHE_LOCK();
		have_exclusive = TRUE;
		goto relook;
	}
	vp = ncp->nc_vp;

	/* We found a "positive" match, return the vnode */
	if (vp) {
		NCHSTAT(ncs_goodhits);

		/*
		 * snapshot the vnode id and take a holdcount so the
		 * vnode can be safely validated after the cache lock
		 * is dropped
		 */
		vid = vp->v_id;
		vnode_hold(vp);
		NAME_CACHE_UNLOCK();

		/* fails if the vnode was recycled out from under us */
		if (vnode_getwithvid(vp, vid)) {
			vnode_drop(vp);
#if COLLECT_STATS
			NAME_CACHE_LOCK();
			NCHSTAT(ncs_badvid);
			NAME_CACHE_UNLOCK();
#endif
			return 0;
		}
		vnode_drop(vp);
		*vpp = vp;
		return -1;
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) {
		if (have_exclusive == TRUE) {
			NCHSTAT(ncs_badhits);
			cache_delete(ncp, 1);
			NAME_CACHE_UNLOCK();
			return 0;
		}
		/* deletion requires the exclusive lock... upgrade and retry */
		NAME_CACHE_UNLOCK();
		NAME_CACHE_LOCK();
		have_exclusive = TRUE;
		goto relook;
	}

	/*
	 * We found a "negative" match, ENOENT notifies client of this match.
	 */
	NCHSTAT(ncs_neghits);

	NAME_CACHE_UNLOCK();
	return ENOENT;
}
2193 
/*
 * Enter (dvp, cnp) -> vp into the name cache and return a second
 * reference on the interned name string for the caller to install
 * as v_name (at the vnode_create call point).  The returned string
 * reference is owned by the caller and must eventually be released
 * via vfs_removename().
 */
const char *
cache_enter_create(vnode_t dvp, vnode_t vp, struct componentname *cnp)
{
	const char *strname;

	if (cnp->cn_hash == 0) {
		cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen);
	}

	/*
	 * grab 2 references on the string entered
	 * one for the cache_enter_locked to consume
	 * and the second to be consumed by v_name (vnode_create call point)
	 */
	strname = add_name_internal(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, TRUE, 0);

	NAME_CACHE_LOCK();

	cache_enter_locked(dvp, vp, cnp, strname);

	NAME_CACHE_UNLOCK();

	return strname;
}
2218 
2219 
2220 /*
2221  * Add an entry to the cache...
2222  * but first check to see if the directory
2223  * that this entry is to be associated with has
2224  * had any cache_purges applied since we took
2225  * our identity snapshot... this check needs to
2226  * be done behind the name cache lock
2227  */
void
cache_enter_with_gen(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int gen)
{
	if (cnp->cn_hash == 0) {
		cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen);
	}

	NAME_CACHE_LOCK();

	/*
	 * only enter the entry if the directory's generation count
	 * still matches the caller's snapshot... a mismatch means a
	 * cache_purge hit dvp in the interim and our identity
	 * information may be stale
	 */
	if (dvp->v_nc_generation == gen) {
		(void)cache_enter_locked(dvp, vp, cnp, NULL);
	}

	NAME_CACHE_UNLOCK();
}
2243 
2244 
2245 /*
2246  * Add an entry to the cache.
2247  */
2248 void
cache_enter(struct vnode * dvp,struct vnode * vp,struct componentname * cnp)2249 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2250 {
2251 	const char *strname;
2252 
2253 	if (cnp->cn_hash == 0) {
2254 		cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen);
2255 	}
2256 
2257 	/*
2258 	 * grab 1 reference on the string entered
2259 	 * for the cache_enter_locked to consume
2260 	 */
2261 	strname = add_name_internal(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, FALSE, 0);
2262 
2263 	NAME_CACHE_LOCK();
2264 
2265 	cache_enter_locked(dvp, vp, cnp, strname);
2266 
2267 	NAME_CACHE_UNLOCK();
2268 }
2269 
2270 
/*
 * Insert (dvp, cnp) -> vp into the name cache.  vp == NULL creates a
 * "negative" entry.  If strname is non-NULL it is a string reference
 * that this function consumes; if NULL, a reference is taken here.
 * Caller must hold the name cache lock exclusively.
 */
static void
cache_enter_locked(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, const char *strname)
{
	struct namecache *ncp, *negp;
	struct nchashhead *ncpp;

	if (nc_disabled) {
		return;
	}

	/*
	 * if the entry is for -ve caching vp is null
	 */
	if ((vp != NULLVP) && (LIST_FIRST(&vp->v_nclinks))) {
		/*
		 * someone beat us to the punch..
		 * this vnode is already in the cache
		 */
		if (strname != NULL) {
			/* we still own the string ref we were handed... release it */
			vfs_removename(strname);
		}
		return;
	}
	/*
	 * We allocate a new entry if we are less than the maximum
	 * allowed and the one at the front of the list is in use.
	 * Otherwise we use the one at the front of the list.
	 */
	if (numcache < desiredNodes &&
	    ((ncp = nchead.tqh_first) == NULL ||
	    ncp->nc_hash.le_prev != 0)) {
		/*
		 * Allocate one more entry
		 */
		ncp = zalloc(namecache_zone);
		numcache++;
	} else {
		/*
		 * reuse an old entry
		 */
		ncp = TAILQ_FIRST(&nchead);
		TAILQ_REMOVE(&nchead, ncp, nc_entry);

		if (ncp->nc_hash.le_prev != 0) {
			/*
			 * still in use... we need to
			 * delete it before re-using it
			 * (nc_hash.le_prev != NULL is the "in use" marker,
			 * see cache_delete)
			 */
			NCHSTAT(ncs_stolen);
			cache_delete(ncp, 0);
		}
	}
	NCHSTAT(ncs_enters);

	/*
	 * Fill in cache info, if vp is NULL this is a "negative" cache entry.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	ncp->nc_hashval = cnp->cn_hash;

	if (strname == NULL) {
		ncp->nc_name = add_name_internal(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, FALSE, 0);
	} else {
		ncp->nc_name = strname;
	}

	//
	// If the bytes of the name associated with the vnode differ,
	// use the name associated with the vnode since the file system
	// may have set that explicitly in the case of a lookup on a
	// case-insensitive file system where the case of the looked up
	// name differs from what is on disk.  For more details, see:
	//   <rdar://problem/8044697> FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories
	//
	const char *vn_name = vp ? vp->v_name : NULL;
	unsigned int len = vn_name ? (unsigned int)strlen(vn_name) : 0;
	if (vn_name && ncp && ncp->nc_name && strncmp(ncp->nc_name, vn_name, len) != 0) {
		unsigned int hash = hash_string(vn_name, len);

		vfs_removename(ncp->nc_name);
		ncp->nc_name = add_name_internal(vn_name, len, hash, FALSE, 0);
		ncp->nc_hashval = hash;
	}

	/*
	 * make us the newest entry in the cache
	 * i.e. we'll be the last to be stolen
	 */
	TAILQ_INSERT_TAIL(&nchead, ncp, nc_entry);

	ncpp = NCHHASH(dvp, cnp->cn_hash);
#if DIAGNOSTIC
	{
		struct namecache *p;

		for (p = ncpp->lh_first; p != 0; p = p->nc_hash.le_next) {
			if (p == ncp) {
				panic("cache_enter: duplicate");
			}
		}
	}
#endif
	/*
	 * make us available to be found via lookup
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);

	if (vp) {
		/*
		 * add to the list of name cache entries
		 * that point at vp
		 */
		LIST_INSERT_HEAD(&vp->v_nclinks, ncp, nc_un.nc_link);
	} else {
		/*
		 * this is a negative cache entry (vp == NULL)
		 * stick it on the negative cache list.
		 */
		TAILQ_INSERT_TAIL(&neghead, ncp, nc_un.nc_negentry);

		ncs_negtotal++;

		if (ncs_negtotal > desiredNegNodes) {
			/*
			 * if we've reached our desired limit
			 * of negative cache entries, delete
			 * the oldest
			 */
			negp = TAILQ_FIRST(&neghead);
			cache_delete(negp, 1);
		}
	}
	/*
	 * add us to the list of name cache entries that
	 * are children of dvp
	 * negative entries go at the head so that
	 * cache_purge_negatives can stop at the first
	 * positive entry it encounters
	 */
	if (vp) {
		TAILQ_INSERT_TAIL(&dvp->v_ncchildren, ncp, nc_child);
	} else {
		TAILQ_INSERT_HEAD(&dvp->v_ncchildren, ncp, nc_child);
	}
}
2414 
2415 
2416 /*
2417  * Initialize CRC-32 remainder table.
2418  */
2419 static void
init_crc32(void)2420 init_crc32(void)
2421 {
2422 	/*
2423 	 * the CRC-32 generator polynomial is:
2424 	 *   x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^10
2425 	 *        + x^8  + x^7  + x^5  + x^4  + x^2  + x + 1
2426 	 */
2427 	unsigned int crc32_polynomial = 0x04c11db7;
2428 	unsigned int i, j;
2429 
2430 	/*
2431 	 * pre-calculate the CRC-32 remainder for each possible octet encoding
2432 	 */
2433 	for (i = 0; i < 256; i++) {
2434 		unsigned int crc_rem = i << 24;
2435 
2436 		for (j = 0; j < 8; j++) {
2437 			if (crc_rem & 0x80000000) {
2438 				crc_rem = (crc_rem << 1) ^ crc32_polynomial;
2439 			} else {
2440 				crc_rem = (crc_rem << 1);
2441 			}
2442 		}
2443 		crc32tab[i] = crc_rem;
2444 	}
2445 }
2446 
2447 
2448 /*
2449  * Name cache initialization, from vfs_init() when we are booting
2450  */
void
nchinit(void)
{
	/* budget negative entries at 10% of the system vnode limit */
	desiredNegNodes = (desiredvnodes / 10);
	desiredNodes = desiredvnodes + desiredNegNodes;

	TAILQ_INIT(&nchead);
	TAILQ_INIT(&neghead);

	init_crc32();

	/* size the hash table to at least 2x the entry budget to keep chains short */
	nchashtbl = hashinit(MAX(CONFIG_NC_HASH, (2 * desiredNodes)), M_CACHE, &nchash);
	/* hashinit returns a mask; keep mask in nchashmask and count in nchash */
	nchashmask = nchash;
	nchash++;

	init_string_table();

	for (int i = 0; i < NUM_STRCACHE_LOCKS; i++) {
		lck_mtx_init(&strcache_mtx_locks[i], &strcache_lck_grp, &strcache_lck_attr);
	}
}
2472 
/* take the global name cache lock for reading (lookups) */
void
name_cache_lock_shared(void)
{
	lck_rw_lock_shared(&namecache_rw_lock);
}
2478 
/* take the global name cache lock exclusively (entry insert/delete) */
void
name_cache_lock(void)
{
	lck_rw_lock_exclusive(&namecache_rw_lock);
}
2484 
/* release the global name cache lock (shared or exclusive) */
void
name_cache_unlock(void)
{
	lck_rw_done(&namecache_rw_lock);
}
2490 
2491 
/*
 * Grow the name cache to accommodate 'newsize' entries (plus 10%
 * headroom for negative entries).  Shrinking is not supported.
 * Returns 0 on success (or no-op), EINVAL/ENOMEM on failure.
 */
int
resize_namecache(int newsize)
{
	struct nchashhead   *new_table;
	struct nchashhead   *old_table;
	struct nchashhead   *old_head, *head;
	struct namecache    *entry, *next;
	uint32_t            i, hashval;
	int                 dNodes, dNegNodes, nelements;
	u_long              new_size, old_size;

	if (newsize < 0) {
		return EINVAL;
	}

	dNegNodes = (newsize / 10);
	dNodes = newsize + dNegNodes;
	// we don't support shrinking yet
	if (dNodes <= desiredNodes) {
		return 0;
	}

	if (os_mul_overflow(dNodes, 2, &nelements)) {
		return EINVAL;
	}

	/* allocate the new table before taking the lock */
	new_table = hashinit(nelements, M_CACHE, &nchashmask);
	new_size  = nchashmask + 1;

	if (new_table == NULL) {
		return ENOMEM;
	}

	NAME_CACHE_LOCK();
	// do the switch!
	old_table = nchashtbl;
	nchashtbl = new_table;
	old_size  = nchash;
	nchash    = new_size;

	// walk the old table and insert all the entries into
	// the new table
	//
	for (i = 0; i < old_size; i++) {
		old_head = &old_table[i];
		for (entry = old_head->lh_first; entry != NULL; entry = next) {
			//
			// XXXdbg - Beware: this assumes that hash_string() does
			//                  the same thing as what happens in
			//                  lookup() over in vfs_lookup.c
			hashval = hash_string(entry->nc_name, 0);
			entry->nc_hashval = hashval;
			head = NCHHASH(entry->nc_dvp, hashval);

			/* capture the successor before the entry is re-linked */
			next = entry->nc_hash.le_next;
			LIST_INSERT_HEAD(head, entry, nc_hash);
		}
	}
	desiredNodes = dNodes;
	desiredNegNodes = dNegNodes;

	NAME_CACHE_UNLOCK();
	/* old_size is mask+1, hashdestroy wants the mask back */
	hashdestroy(old_table, M_CACHE, old_size - 1);

	return 0;
}
2558 
/*
 * Unlink a name cache entry from all the lists it is on (vnode links
 * or negative list, parent's child list, hash chain) and release its
 * name string.  If free_entry is set the entry itself is returned to
 * the zone; otherwise it stays on nchead for reuse.
 * Caller must hold the name cache lock exclusively.
 */
static void
cache_delete(struct namecache *ncp, int free_entry)
{
	NCHSTAT(ncs_deletes);

	if (ncp->nc_vp) {
		LIST_REMOVE(ncp, nc_un.nc_link);
	} else {
		/* negative entry: remove from the negative-entry list */
		TAILQ_REMOVE(&neghead, ncp, nc_un.nc_negentry);
		ncs_negtotal--;
	}
	TAILQ_REMOVE(&(ncp->nc_dvp->v_ncchildren), ncp, nc_child);

	LIST_REMOVE(ncp, nc_hash);
	/*
	 * this field is used to indicate
	 * that the entry is in use and
	 * must be deleted before it can
	 * be reused...
	 */
	ncp->nc_hash.le_prev = NULL;

	vfs_removename(ncp->nc_name);
	ncp->nc_name = NULL;
	if (free_entry) {
		TAILQ_REMOVE(&nchead, ncp, nc_entry);
		zfree(namecache_zone, ncp);
		numcache--;
	}
}
2589 
2590 
2591 /*
2592  * purge the entry associated with the
2593  * specified vnode from the name cache
2594  */
static void
cache_purge_locked(vnode_t vp, kauth_cred_t *credp)
{
	struct namecache *ncp;

	*credp = NULL;
	/* fast exit if the vnode has no cache state at all */
	if ((LIST_FIRST(&vp->v_nclinks) == NULL) &&
	    (TAILQ_FIRST(&vp->v_ncchildren) == NULL) &&
	    (vnode_cred(vp) == NOCRED) &&
	    (vp->v_parent == NULLVP)) {
		return;
	}

	/* invalidate stale identity snapshots (see cache_enter_with_gen) */
	if (vp->v_parent) {
		vp->v_parent->v_nc_generation++;
	}

	/* drop every entry that names this vnode... */
	while ((ncp = LIST_FIRST(&vp->v_nclinks))) {
		cache_delete(ncp, 1);
	}

	/* ...and every entry for which it is the parent directory */
	while ((ncp = TAILQ_FIRST(&vp->v_ncchildren))) {
		cache_delete(ncp, 1);
	}

	/*
	 * Use a temp variable to avoid kauth_cred_unref() while NAME_CACHE_LOCK is held
	 */
	*credp = vnode_cred(vp);
	vp->v_cred = NOCRED;
	vp->v_authorized_actions = 0;
}
2627 
void
cache_purge(vnode_t vp)
{
	kauth_cred_t tcred = NULL;

	/*
	 * unlocked pre-check mirroring cache_purge_locked's fast path,
	 * so a vnode with no cache state avoids taking the lock at all
	 */
	if ((LIST_FIRST(&vp->v_nclinks) == NULL) &&
	    (TAILQ_FIRST(&vp->v_ncchildren) == NULL) &&
	    (vnode_cred(vp) == NOCRED) &&
	    (vp->v_parent == NULLVP)) {
		return;
	}

	NAME_CACHE_LOCK();

	cache_purge_locked(vp, &tcred);

	NAME_CACHE_UNLOCK();

	/* drop the cred reference outside the name cache lock */
	kauth_cred_set(&tcred, NOCRED);
}
2648 
2649 /*
2650  * Purge all negative cache entries that are children of the
2651  * given vnode.  A case-insensitive file system (or any file
2652  * system that has multiple equivalent names for the same
2653  * directory entry) can use this when creating or renaming
2654  * to remove negative entries that may no longer apply.
2655  */
2656 void
cache_purge_negatives(vnode_t vp)2657 cache_purge_negatives(vnode_t vp)
2658 {
2659 	struct namecache *ncp, *next_ncp;
2660 
2661 	NAME_CACHE_LOCK();
2662 
2663 	TAILQ_FOREACH_SAFE(ncp, &vp->v_ncchildren, nc_child, next_ncp) {
2664 		if (ncp->nc_vp) {
2665 			break;
2666 		}
2667 
2668 		cache_delete(ncp, 1);
2669 	}
2670 
2671 	NAME_CACHE_UNLOCK();
2672 }
2673 
2674 /*
2675  * Flush all entries referencing a particular filesystem.
2676  *
2677  * Since we need to check it anyway, we will flush all the invalid
2678  * entries at the same time.
2679  */
void
cache_purgevfs(struct mount *mp)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;

	NAME_CACHE_LOCK();
	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) {
restart:
		for (ncp = ncpp->lh_first; ncp != 0; ncp = ncp->nc_hash.le_next) {
			if (ncp->nc_dvp->v_mount == mp) {
				/*
				 * cache_delete unlinks ncp from this chain,
				 * invalidating our iterator... restart the
				 * scan of this bucket from its head
				 */
				cache_delete(ncp, 0);
				goto restart;
			}
		}
	}
	NAME_CACHE_UNLOCK();
}
2699 
2700 
2701 
2702 //
2703 // String ref routines
2704 //
2705 static LIST_HEAD(stringhead, string_t) * string_ref_table;
2706 static u_long   string_table_mask;
2707 static uint32_t filled_buckets = 0;
2708 
2709 
2710 typedef struct string_t {
2711 	LIST_ENTRY(string_t)  hash_chain;
2712 	char                  *str;
2713 	uint32_t              strbuflen;
2714 	uint32_t              refcount;
2715 } string_t;
2716 
2717 
/*
 * Double the string ref hash table when it gets too full.
 * Takes strtable_rw_lock exclusively, re-verifies the fullness
 * condition (another thread may have resized first), then swaps
 * in a new table and rehashes every entry.
 */
static void
resize_string_ref_table(void)
{
	struct stringhead *new_table;
	struct stringhead *old_table;
	struct stringhead *old_head, *head;
	string_t          *entry, *next;
	uint32_t           i, hashval;
	u_long             new_mask, old_mask;

	/*
	 * need to hold the table lock exclusively
	 * in order to grow the table... need to recheck
	 * the need to resize again after we've taken
	 * the lock exclusively in case some other thread
	 * beat us to the punch
	 */
	lck_rw_lock_exclusive(&strtable_rw_lock);

	if (4 * filled_buckets < ((string_table_mask + 1) * 3)) {
		lck_rw_done(&strtable_rw_lock);
		return;
	}
	assert(string_table_mask < INT32_MAX);
	new_table = hashinit((int)(string_table_mask + 1) * 2, M_CACHE, &new_mask);

	if (new_table == NULL) {
		printf("failed to resize the hash table.\n");
		lck_rw_done(&strtable_rw_lock);
		return;
	}

	// do the switch!
	old_table         = string_ref_table;
	string_ref_table  = new_table;
	old_mask          = string_table_mask;
	string_table_mask = new_mask;
	filled_buckets    = 0;

	// walk the old table and insert all the entries into
	// the new table
	//
	for (i = 0; i <= old_mask; i++) {
		old_head = &old_table[i];
		for (entry = old_head->lh_first; entry != NULL; entry = next) {
			hashval = hash_string((const char *)entry->str, 0);
			head = &string_ref_table[hashval & string_table_mask];
			if (head->lh_first == NULL) {
				filled_buckets++;
			}
			/* capture the successor before the entry is re-linked */
			next = entry->hash_chain.le_next;
			LIST_INSERT_HEAD(head, entry, hash_chain);
		}
	}
	lck_rw_done(&strtable_rw_lock);

	hashdestroy(old_table, M_CACHE, old_mask);
}
2776 
2777 
/* allocate the initial string ref hash table (called once from nchinit) */
static void
init_string_table(void)
{
	string_ref_table = hashinit(CONFIG_VFS_NAMES, M_CACHE, &string_table_mask);
}
2783 
2784 
/*
 * Public wrapper: intern 'name' and return a referenced pointer to
 * the shared copy.  Release with vfs_removename().
 */
const char *
vfs_addname(const char *name, uint32_t len, u_int hashval, u_int flags)
{
	return add_name_internal(name, len, hashval, FALSE, flags);
}
2790 
2791 
/*
 * Intern a name in the string ref table, returning a pointer to the
 * shared NUL-terminated copy with one reference (two if need_extra_ref).
 * A hashval of 0 means "compute it here".  Locking is two-level: the
 * table-wide rw lock held shared to pin the table geometry, plus a
 * per-bucket-group spin mutex for chain access.
 */
static const char *
add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_extra_ref, __unused u_int flags)
{
	struct stringhead *head;
	string_t          *entry;
	uint32_t          chain_len = 0;
	uint32_t          hash_index;
	uint32_t          lock_index;
	char              *ptr;

	if (len > MAXPATHLEN) {
		len = MAXPATHLEN;
	}

	/*
	 * if the length already accounts for the null-byte, then
	 * subtract one so later on we don't index past the end
	 * of the string.
	 */
	if (len > 0 && name[len - 1] == '\0') {
		len--;
	}
	if (hashval == 0) {
		hashval = hash_string(name, len);
	}

	/*
	 * take this lock 'shared' to keep the hash stable
	 * if someone else decides to grow the pool they
	 * will take this lock exclusively
	 */
	lck_rw_lock_shared(&strtable_rw_lock);

	/*
	 * If the table gets more than 3/4 full, resize it
	 */
	if (4 * filled_buckets >= ((string_table_mask + 1) * 3)) {
		lck_rw_done(&strtable_rw_lock);

		resize_string_ref_table();

		lck_rw_lock_shared(&strtable_rw_lock);
	}
	/* must compute these under the rw lock: the table can be swapped */
	hash_index = hashval & string_table_mask;
	lock_index = hash_index % NUM_STRCACHE_LOCKS;

	head = &string_ref_table[hash_index];

	lck_mtx_lock_spin(&strcache_mtx_locks[lock_index]);

	for (entry = head->lh_first; entry != NULL; chain_len++, entry = entry->hash_chain.le_next) {
		if (strncmp(entry->str, name, len) == 0 && entry->str[len] == 0) {
			/* already interned... just bump the refcount */
			entry->refcount++;
			break;
		}
	}
	if (entry == NULL) {
		const uint32_t buflen = len + 1;

		/* about to block in kalloc... drop out of spin mode first */
		lck_mtx_convert_spin(&strcache_mtx_locks[lock_index]);
		/*
		 * it wasn't already there so add it.
		 */
		entry = kalloc_type(string_t, Z_WAITOK);

		if (head->lh_first == NULL) {
			OSAddAtomic(1, &filled_buckets);
		}
		ptr = kalloc_data(buflen, Z_WAITOK);
		strncpy(ptr, name, len);
		ptr[len] = '\0';
		entry->str = ptr;
		entry->strbuflen = buflen;
		entry->refcount = 1;
		LIST_INSERT_HEAD(head, entry, hash_chain);
	}
	if (need_extra_ref == TRUE) {
		entry->refcount++;
	}

	lck_mtx_unlock(&strcache_mtx_locks[lock_index]);
	lck_rw_done(&strtable_rw_lock);

	return (const char *)entry->str;
}
2877 
2878 
/*
 * Drop one reference on an interned name previously returned by
 * vfs_addname/add_name_internal.  Frees the entry when the last
 * reference goes away.  Returns 0 on success, ENOENT if the pointer
 * is not in the table.  Matching is by pointer identity, not content.
 */
int
vfs_removename(const char *nameref)
{
	struct stringhead *head;
	string_t          *entry;
	uint32_t           hashval;
	uint32_t           hash_index;
	uint32_t           lock_index;
	int                retval = ENOENT;

	hashval = hash_string(nameref, 0);

	/*
	 * take this lock 'shared' to keep the hash stable
	 * if someone else decides to grow the pool they
	 * will take this lock exclusively
	 */
	lck_rw_lock_shared(&strtable_rw_lock);
	/*
	 * must compute the head behind the table lock
	 * since the size and location of the table
	 * can change on the fly
	 */
	hash_index = hashval & string_table_mask;
	lock_index = hash_index % NUM_STRCACHE_LOCKS;

	head = &string_ref_table[hash_index];

	lck_mtx_lock_spin(&strcache_mtx_locks[lock_index]);

	for (entry = head->lh_first; entry != NULL; entry = entry->hash_chain.le_next) {
		if (entry->str == nameref) {
			entry->refcount--;

			if (entry->refcount == 0) {
				LIST_REMOVE(entry, hash_chain);

				if (head->lh_first == NULL) {
					OSAddAtomic(-1, &filled_buckets);
				}
			} else {
				/* refs remain: entry stays; NULL signals "don't free below" */
				entry = NULL;
			}
			retval = 0;
			break;
		}
	}
	lck_mtx_unlock(&strcache_mtx_locks[lock_index]);
	lck_rw_done(&strtable_rw_lock);

	/* free outside the locks: only non-NULL when the last ref was dropped */
	if (entry) {
		assert(entry->refcount == 0);
		kfree_data(entry->str, entry->strbuflen);
		entry->str = NULL;
		entry->strbuflen = 0;
		kfree_type(string_t, entry);
	}

	return retval;
}
2939 
2940 
2941 #ifdef DUMP_STRING_TABLE
2942 void
dump_string_table(void)2943 dump_string_table(void)
2944 {
2945 	struct stringhead *head;
2946 	string_t          *entry;
2947 	u_long            i;
2948 
2949 	lck_rw_lock_shared(&strtable_rw_lock);
2950 
2951 	for (i = 0; i <= string_table_mask; i++) {
2952 		head = &string_ref_table[i];
2953 		for (entry = head->lh_first; entry != NULL; entry = entry->hash_chain.le_next) {
2954 			printf("%6d - %s\n", entry->refcount, entry->str);
2955 		}
2956 	}
2957 	lck_rw_done(&strtable_rw_lock);
2958 }
2959 #endif  /* DUMP_STRING_TABLE */
2960