xref: /xnu-11417.140.69/bsd/vfs/vfs_lookup.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1989, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. All advertising materials mentioning features or use of this software
47  *    must display the following acknowledgement:
48  *	This product includes software developed by the University of
49  *	California, Berkeley and its contributors.
50  * 4. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_lookup.c	8.10 (Berkeley) 5/27/95
67  */
68 /*
69  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
70  * support for mandatory and extensible security protections.  This notice
71  * is included in support of clause 2.2 (b) of the Apple Public License,
72  * Version 2.0.
73  */
74 
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/syslimits.h>
78 #include <sys/time.h>
79 #include <sys/namei.h>
80 #include <sys/vm.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/errno.h>
84 #include <kern/kalloc.h>
85 #include <sys/filedesc.h>
86 #include <sys/proc_internal.h>
87 #include <sys/kdebug.h>
88 #include <sys/unistd.h>         /* For _PC_NAME_MAX */
89 #include <sys/uio_internal.h>
90 #include <sys/kauth.h>
91 #include <kern/zalloc.h>
92 #include <security/audit/audit.h>
93 #if CONFIG_MACF
94 #include <security/mac_framework.h>
95 #endif
96 #include <os/atomic_private.h>
97 
98 #include <sys/paths.h>
99 
100 #if NAMEDRSRCFORK
101 #include <sys/xattr.h>
102 #endif
103 /*
104  * The minimum volfs-style pathname is 9.
105  * Example:  "/.vol/1/2"
106  */
107 #define VOLFS_MIN_PATH_LEN  9
108 
109 
110 #if CONFIG_VOLFS
111 static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx);
112 #define MAX_VOLFS_RESTARTS 5
113 #endif
114 
115 static int              lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, int vbusyflags, vfs_context_t ctx);
116 static int              lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, bool* dp_has_iocount, vfs_context_t ctx);
117 static int              lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx);
118 static void             lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation);
119 static int              lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly,
120     int vbusyflags, int *keep_going, int nc_generation,
121     int wantparent, int atroot, vfs_context_t ctx);
122 static int              lookup_handle_emptyname(struct nameidata *ndp, struct componentname *cnp, int wantparent);
123 
124 #if NAMEDRSRCFORK
125 static int              lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname *cnp, int wantparent, vfs_context_t ctx);
126 #endif
127 
128 extern lck_rw_t rootvnode_rw_lock;
129 
130 #define RESOLVE_NOFOLLOW_ANY  0x00000001
131 #define RESOLVE_CHECKED       0x80000000
132 
133 /*
134  * Convert a pathname into a pointer to a locked inode.
135  *
136  * The FOLLOW flag is set when symbolic links are to be followed
137  * when they occur at the end of the name translation process.
138  * Symbolic links are always followed for all other pathname
139  * components other than the last.
140  *
141  * The segflg defines whether the name is to be copied from user
142  * space or kernel space.
143  *
144  * Overall outline of namei:
145  *
146  *	copy in name
147  *	get starting directory
148  *	while (!done && !error) {
149  *		call lookup to search path.
150  *		if symbolic link, massage name in buffer and continue
151  *	}
152  *
153  * Returns:	0			Success
154  *		ENOENT			No such file or directory
155  *		ELOOP			Too many levels of symbolic links
156  *		ENAMETOOLONG		Filename too long
157  *		copyinstr:EFAULT	Bad address
158  *		copyinstr:ENAMETOOLONG	Filename too long
159  *		lookup:EBADF		Bad file descriptor
160  *		lookup:EROFS
161  *		lookup:EACCES
162  *		lookup:EPERM
163  *		lookup:ERECYCLE	 vnode was recycled from underneath us in lookup.
164  *						 This means we should re-drive lookup from this point.
165  *		lookup: ???
166  *		VNOP_READLINK:???
167  */
168 int
namei(struct nameidata * ndp)169 namei(struct nameidata *ndp)
170 {
171 	struct vnode *dp;       /* the directory we are searching */
172 	struct vnode *usedvp = ndp->ni_dvp;  /* store pointer to vp in case we must loop due to
173 	                                      *                                          heavy vnode pressure */
174 	uint32_t cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */
175 	int error;
176 	struct componentname *cnp = &ndp->ni_cnd;
177 	vfs_context_t ctx = cnp->cn_context;
178 	proc_t p = vfs_context_proc(ctx);
179 #if CONFIG_AUDIT
180 /* XXX ut should be from context */
181 	uthread_t ut = current_uthread();
182 #endif
183 
184 #if CONFIG_VOLFS
185 	int volfs_restarts = 0;
186 #endif
187 	size_t bytes_copied = 0;
188 	size_t resolve_prefix_len = 0;
189 	vnode_t rootdir_with_usecount = NULLVP;
190 	vnode_t startdir_with_usecount = NULLVP;
191 	vnode_t usedvp_dp = NULLVP;
192 	int32_t old_count = 0;
193 	uint32_t resolve_flags = 0;
194 	int resolve_error = 0;
195 	bool dp_has_iocount = false;
196 	bool clear_usedvp = false;
197 
198 #if DIAGNOSTIC
199 	if (!vfs_context_ucred(ctx) || !p) {
200 		panic("namei: bad cred/proc");
201 	}
202 	if (cnp->cn_nameiop & (~OPMASK)) {
203 		panic("namei: nameiop contaminated with flags");
204 	}
205 	if (cnp->cn_flags & OPMASK) {
206 		panic("namei: flags contaminated with nameiops");
207 	}
208 #endif
209 
210 	/*
211 	 * A compound VNOP found something that needs further processing:
212 	 * either a trigger vnode, a covered directory, or a symlink.
213 	 */
214 	if (ndp->ni_flag & NAMEI_CONTLOOKUP) {
215 		int rdonly, vbusyflags, keep_going, wantparent;
216 
217 		rdonly = cnp->cn_flags & RDONLY;
218 		vbusyflags = ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) ? LK_NOWAIT : 0;
219 		keep_going = 0;
220 		wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
221 
222 		ndp->ni_flag &= ~(NAMEI_CONTLOOKUP);
223 
224 		error = lookup_handle_found_vnode(ndp, &ndp->ni_cnd, rdonly, vbusyflags,
225 		    &keep_going, ndp->ni_ncgeneration, wantparent, 0, ctx);
226 		if (error) {
227 			goto out_drop;
228 		}
229 		if (keep_going) {
230 			if ((cnp->cn_flags & ISSYMLINK) == 0) {
231 				panic("We need to keep going on a continued lookup, but for vp type %d (tag %d)", ndp->ni_vp->v_type, ndp->ni_vp->v_tag);
232 			}
233 			goto continue_symlink;
234 		}
235 
236 		return 0;
237 	}
238 
239 vnode_recycled:
240 
241 	/*
242 	 * Get a buffer for the name to be translated, and copy the
243 	 * name into the buffer.
244 	 */
245 	if ((cnp->cn_flags & HASBUF) == 0) {
246 		cnp->cn_pnbuf = ndp->ni_pathbuf;
247 		cnp->cn_pnlen = PATHBUFLEN;
248 	}
249 
250 retry_copy:
251 	if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
252 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
253 		    cnp->cn_pnlen, &bytes_copied);
254 	} else {
255 		error = copystr(CAST_DOWN(void *, ndp->ni_dirp), cnp->cn_pnbuf,
256 		    cnp->cn_pnlen, &bytes_copied);
257 	}
258 	if (error == ENAMETOOLONG && !(cnp->cn_flags & HASBUF)) {
259 		if (bytes_copied == PATHBUFLEN) {
260 			resolve_error = lookup_check_for_resolve_prefix(cnp->cn_pnbuf, PATHBUFLEN,
261 			    PATHBUFLEN, &resolve_flags, &resolve_prefix_len);
262 			/* errors from copyinstr take precedence over resolve_error */
263 			if (!resolve_error && resolve_prefix_len) {
264 				ndp->ni_dirp += resolve_prefix_len;
265 				resolve_prefix_len = 0;
266 			}
267 		}
268 
269 		cnp->cn_pnbuf = zalloc(ZV_NAMEI);
270 		cnp->cn_flags |= HASBUF;
271 		cnp->cn_pnlen = MAXPATHLEN;
272 		bytes_copied = 0;
273 
274 		goto retry_copy;
275 	} else if (error == ENAMETOOLONG && (cnp->cn_flags & HASBUF) &&
276 	    (cnp->cn_pnlen * 2) <= MAXLONGPATHLEN && proc_support_long_paths(p)) {
277 		if (cnp->cn_pnlen == MAXPATHLEN) {
278 			/* First time we arrive here, the buffer came from ZV_NAMEI */
279 			zfree(ZV_NAMEI, cnp->cn_pnbuf);
280 		} else {
281 			kfree_data(cnp->cn_pnbuf, cnp->cn_pnlen);
282 		}
283 
284 		resolve_error = 0;
285 
286 		cnp->cn_pnlen *= 2;
287 		cnp->cn_pnbuf = kalloc_data(cnp->cn_pnlen, Z_WAITOK | Z_ZERO | Z_NOFAIL);
288 		bytes_copied = 0;
289 
290 		goto retry_copy;
291 	}
292 	if (error) {
293 		goto error_out;
294 	} else if (resolve_error) {
295 		error = resolve_error;
296 		goto error_out;
297 	}
298 	assert(bytes_copied <= cnp->cn_pnlen);
299 	ndp->ni_pathlen = (u_int)bytes_copied;
300 	bytes_copied = 0;
301 
302 	if (!(resolve_flags & RESOLVE_CHECKED)) {
303 		assert(!(cnp->cn_flags & HASBUF) && (cnp->cn_pnlen == PATHBUFLEN));
304 		error = lookup_check_for_resolve_prefix(cnp->cn_pnbuf, cnp->cn_pnlen, ndp->ni_pathlen,
305 		    &resolve_flags, &resolve_prefix_len);
306 		if (error) {
307 			goto error_out;
308 		}
309 		if (resolve_prefix_len) {
310 			/*
311 			 * Since this is pointing to the static path buffer instead of a zalloc'ed memorry,
312 			 * we're not going to attempt to free this, so it is perfectly fine to change the
313 			 * value of cnp->cn_pnbuf.
314 			 */
315 			cnp->cn_pnbuf += resolve_prefix_len;
316 			cnp->cn_pnlen -= resolve_prefix_len;
317 			ndp->ni_pathlen -= resolve_prefix_len;
318 			resolve_prefix_len = 0;
319 		}
320 	}
321 
322 	/* At this point we should have stripped off the prefix from the path that has to be looked up */
323 	assert((resolve_flags & RESOLVE_CHECKED) && (resolve_prefix_len == 0));
324 
325 	/*
326 	 * Since the name cache may contain positive entries of
327 	 * the incorrect case, force lookup() to bypass the cache
328 	 * and call directly into the filesystem for each path
329 	 * component. Note: the FS may still consult the cache,
330 	 * but can apply rules to validate the results.
331 	 */
332 	if (proc_is_forcing_hfs_case_sensitivity(p)) {
333 		cnp->cn_flags |= CN_SKIPNAMECACHE;
334 	}
335 
336 #if CONFIG_VOLFS
337 	/*
338 	 * Check for legacy volfs style pathnames.
339 	 *
340 	 * For compatibility reasons we currently allow these paths,
341 	 * but future versions of the OS may not support them.
342 	 */
343 	if (ndp->ni_pathlen >= VOLFS_MIN_PATH_LEN &&
344 	    cnp->cn_pnbuf[0] == '/' &&
345 	    cnp->cn_pnbuf[1] == '.' &&
346 	    cnp->cn_pnbuf[2] == 'v' &&
347 	    cnp->cn_pnbuf[3] == 'o' &&
348 	    cnp->cn_pnbuf[4] == 'l' &&
349 	    cnp->cn_pnbuf[5] == '/') {
350 		char * realpath;
351 		size_t realpathlen;
352 		int realpath_err;
353 		/* Attempt to resolve a legacy volfs style pathname. */
354 
355 		realpathlen = MAXPATHLEN;
356 		do {
357 			if (realpathlen == MAXPATHLEN) {
358 				realpath = zalloc(ZV_NAMEI);
359 			} else {
360 				/*
361 				 * To be consistent with the behavior of openbyid_np, which always supports
362 				 * long paths, do not gate our support on proc_support_long_paths either.
363 				 */
364 				realpath = kalloc_data(realpathlen, Z_WAITOK | Z_ZERO | Z_NOFAIL);
365 			}
366 			/*
367 			 * We only error out on the ENAMETOOLONG cases where we know that
368 			 * vfs_getrealpath translation succeeded but the path could not fit into
369 			 * realpathlen characters.  In other failure cases, we may be dealing with a path
370 			 * that legitimately looks like /.vol/1234/567 and is not meant to be translated
371 			 */
372 			if ((realpath_err = vfs_getrealpath(&cnp->cn_pnbuf[6], realpath, realpathlen, ctx))) {
373 				if (realpathlen == MAXPATHLEN) {
374 					zfree(ZV_NAMEI, realpath);
375 				} else {
376 					kfree_data(realpath, realpathlen);
377 				}
378 				if (realpath_err == ENOSPC || realpath_err == ENAMETOOLONG) {
379 					error = ENAMETOOLONG;
380 				}
381 			} else {
382 				size_t tmp_len;
383 				if (cnp->cn_flags & HASBUF) {
384 					if (cnp->cn_pnlen == MAXPATHLEN) {
385 						zfree(ZV_NAMEI, cnp->cn_pnbuf);
386 					} else {
387 						kfree_data(cnp->cn_pnbuf, cnp->cn_pnlen);
388 					}
389 				}
390 				cnp->cn_pnbuf = realpath;
391 				cnp->cn_pnlen = (int)realpathlen;
392 				tmp_len = strlen(realpath) + 1;
393 				assert(tmp_len <= UINT_MAX);
394 				ndp->ni_pathlen = (u_int)tmp_len;
395 				cnp->cn_flags |= HASBUF | CN_VOLFSPATH;
396 				error = 0;
397 			}
398 		} while (error == ENAMETOOLONG && (realpathlen *= 2) && realpathlen <= MAXLONGPATHLEN);
399 
400 		if (error) {
401 			goto error_out;
402 		}
403 	}
404 #endif /* CONFIG_VOLFS */
405 
406 #if CONFIG_AUDIT
407 	/* If we are auditing the kernel pathname, save the user pathname */
408 	if (cnp->cn_flags & AUDITVNPATH1) {
409 		AUDIT_ARG(upath, ut->uu_cdir, cnp->cn_pnbuf, ARG_UPATH1);
410 	}
411 	if (cnp->cn_flags & AUDITVNPATH2) {
412 		AUDIT_ARG(upath, ut->uu_cdir, cnp->cn_pnbuf, ARG_UPATH2);
413 	}
414 #endif /* CONFIG_AUDIT */
415 
416 	/*
417 	 * Do not allow empty pathnames
418 	 */
419 	if (*cnp->cn_pnbuf == '\0') {
420 		error = ENOENT;
421 		goto error_out;
422 	}
423 	if (ndp->ni_flag & NAMEI_NOFOLLOW_ANY || (resolve_flags & RESOLVE_NOFOLLOW_ANY)) {
424 		ndp->ni_loopcnt = MAXSYMLINKS;
425 	} else {
426 		ndp->ni_loopcnt = 0;
427 	}
428 
429 	/*
430 	 * determine the starting point for the translation.
431 	 */
432 	proc_dirs_lock_shared(p);
433 	lck_rw_lock_shared(&rootvnode_rw_lock);
434 
435 	if (!(ndp->ni_flag & NAMEI_ROOTDIR)) {
436 		if (fdt_flag_test(&p->p_fd, FD_CHROOT)) {
437 			ndp->ni_rootdir = p->p_fd.fd_rdir;
438 		} else {
439 			ndp->ni_rootdir = rootvnode;
440 		}
441 	}
442 
443 	if (!ndp->ni_rootdir) {
444 		if (ndp->ni_flag & NAMEI_ROOTDIR) {
445 			panic("NAMEI_ROOTDIR is set but ni_rootdir is not\n");
446 		} else if (fdt_flag_test(&p->p_fd, FD_CHROOT)) {
447 			/* This should be a panic */
448 			printf("p->p_fd.fd_rdir is not set\n");
449 		} else {
450 			printf("rootvnode is not set\n");
451 		}
452 		lck_rw_unlock_shared(&rootvnode_rw_lock);
453 		proc_dirs_unlock_shared(p);
454 		error = ENOENT;
455 		goto error_out;
456 	}
457 
458 	cnp->cn_nameptr = cnp->cn_pnbuf;
459 
460 	ndp->ni_usedvp = NULLVP;
461 
462 	if (*(cnp->cn_nameptr) == '/') {
463 		while (*(cnp->cn_nameptr) == '/') {
464 			cnp->cn_nameptr++;
465 			ndp->ni_pathlen--;
466 		}
467 		if (ndp->ni_flag & NAMEI_RESOLVE_BENEATH) {
468 			/* Absolute paths are never allowed in NAMEI_RESOLVE_BENEATH */
469 			lck_rw_unlock_shared(&rootvnode_rw_lock);
470 			proc_dirs_unlock_shared(p);
471 			error = EACCES;
472 			goto error_out;
473 		}
474 		dp = ndp->ni_rootdir;
475 	} else if (cnp->cn_flags & USEDVP) {
476 		dp = ndp->ni_dvp;
477 		ndp->ni_usedvp = dp;
478 		usedvp_dp = dp;
479 	} else {
480 		dp = vfs_context_cwd(ctx);
481 		if (ndp->ni_flag & NAMEI_RESOLVE_BENEATH) {
482 			/* Store the starting directory because it can change after a symlink traversal */
483 			ndp->ni_usedvp = dp;
484 			clear_usedvp = true;
485 		}
486 	}
487 
488 	if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) {
489 		dp = NULLVP;
490 		lck_rw_unlock_shared(&rootvnode_rw_lock);
491 		proc_dirs_unlock_shared(p);
492 		error = ENOENT;
493 		goto error_out;
494 	}
495 
496 	/*
497 	 * We need our own usecount on the root vnode and the starting dir across
498 	 * the lookup. There's two things that be done here. We can hold the locks
499 	 * (which protect the existing usecounts on the directories) across the
500 	 * lookup or take our own usecount. Holding the locks across the lookup can
501 	 * cause deadlock issues if we re-enter namei on the same thread so the
502 	 * correct thing to do is to acquire our own usecount.
503 	 *
504 	 * Ideally, the usecount should be obtained by vnode_get->vnode_ref->vnode_put.
505 	 * However when this vnode is the rootvnode, that sequence will produce a
506 	 * lot of vnode mutex locks and  unlocks on a single vnode (the rootvnode)
507 	 * and will be highly contended and degrade performance. Since we have
508 	 * an existing usecount protected by the locks we hold, we'll just use
509 	 * an atomic op to increment the usecount on a vnode which already has one
510 	 * and can't be released because we have the locks which protect against that
511 	 * happening.
512 	 */
513 	rootdir_with_usecount = ndp->ni_rootdir;
514 	old_count = os_atomic_inc_orig(&rootdir_with_usecount->v_usecount, relaxed);
515 	if (old_count < 1) {
516 		panic("(1) invalid pre-increment usecount (%d) for rootdir vnode %p",
517 		    old_count, rootdir_with_usecount);
518 	} else if (old_count == INT32_MAX) {
519 		panic("(1) usecount overflow for vnode %p", rootdir_with_usecount);
520 	}
521 
522 	if ((dp != rootdir_with_usecount) && (dp != usedvp_dp)) {
523 		old_count = os_atomic_inc_orig(&dp->v_usecount, relaxed);
524 		if (old_count < 1) {
525 			panic("(2) invalid pre-increment usecount (%d) for vnode %p", old_count, dp);
526 		} else if (old_count == INT32_MAX) {
527 			panic("(2) usecount overflow for vnode %p", dp);
528 		}
529 		startdir_with_usecount = dp;
530 	}
531 
532 	/* Now that we have our usecount, release the locks */
533 	lck_rw_unlock_shared(&rootvnode_rw_lock);
534 	proc_dirs_unlock_shared(p);
535 
536 	ndp->ni_dvp = NULLVP;
537 	ndp->ni_vp  = NULLVP;
538 
539 	for (;;) {
540 #if CONFIG_MACF
541 		/*
542 		 * Give MACF policies a chance to reject the lookup
543 		 * before performing any filesystem operations.
544 		 * This hook is called before resolving the path and
545 		 * again each time a symlink is encountered.
546 		 * NB: policies receive path information as supplied
547 		 *     by the caller and thus cannot be trusted.
548 		 */
549 		error = mac_vnode_check_lookup_preflight(ctx, dp, cnp->cn_nameptr, cnp->cn_namelen);
550 		if (error) {
551 			goto error_out;
552 		}
553 #endif
554 		ndp->ni_startdir = dp;
555 		dp = NULLVP;
556 
557 		if ((error = lookup(ndp))) {
558 			goto error_out;
559 		}
560 
561 		/*
562 		 * Check for symbolic link
563 		 */
564 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
565 			if (startdir_with_usecount) {
566 				vnode_rele(startdir_with_usecount);
567 				startdir_with_usecount = NULLVP;
568 			}
569 			if (rootdir_with_usecount) {
570 				lck_rw_lock_shared(&rootvnode_rw_lock);
571 				if (rootdir_with_usecount == rootvnode) {
572 					old_count = os_atomic_dec_orig(&rootdir_with_usecount->v_usecount, relaxed);
573 					if (old_count < 2) {
574 						/*
575 						 * There needs to have been at least 1 usecount left on the rootvnode
576 						 */
577 						panic("(3) Unexpected pre-decrement value (%d) of usecount for rootvnode %p",
578 						    old_count, rootdir_with_usecount);
579 					}
580 					rootdir_with_usecount = NULLVP;
581 				}
582 				lck_rw_unlock_shared(&rootvnode_rw_lock);
583 				if (rootdir_with_usecount) {
584 					vnode_rele(rootdir_with_usecount);
585 					rootdir_with_usecount = NULLVP;
586 				}
587 			}
588 
589 			return 0;
590 		}
591 
592 continue_symlink:
593 		/* Gives us a new path to process, and a starting dir */
594 		error = lookup_handle_symlink(ndp, &dp, &dp_has_iocount, ctx);
595 		if (error != 0) {
596 			break;
597 		}
598 		if (dp_has_iocount) {
599 			if ((dp != rootdir_with_usecount) && (dp != startdir_with_usecount) &&
600 			    (dp != usedvp_dp)) {
601 				if (startdir_with_usecount) {
602 					vnode_rele(startdir_with_usecount);
603 				}
604 				vnode_ref_ext(dp, 0, VNODE_REF_FORCE);
605 				startdir_with_usecount = dp;
606 			}
607 			vnode_put(dp);
608 			dp_has_iocount = false;
609 		}
610 	}
611 	/*
612 	 * only come here if we fail to handle a SYMLINK...
613 	 * if either ni_dvp or ni_vp is non-NULL, then
614 	 * we need to drop the iocount that was picked
615 	 * up in the lookup routine
616 	 */
617 out_drop:
618 	if (ndp->ni_dvp) {
619 		vnode_put(ndp->ni_dvp);
620 	}
621 	if (ndp->ni_vp) {
622 		vnode_put(ndp->ni_vp);
623 	}
624 error_out:
625 	if (clear_usedvp) {
626 		ndp->ni_usedvp = NULLVP;
627 	}
628 	if (startdir_with_usecount) {
629 		vnode_rele(startdir_with_usecount);
630 		startdir_with_usecount = NULLVP;
631 	}
632 	if (rootdir_with_usecount) {
633 		lck_rw_lock_shared(&rootvnode_rw_lock);
634 		if (rootdir_with_usecount == rootvnode) {
635 			old_count = os_atomic_dec_orig(&rootdir_with_usecount->v_usecount, relaxed);
636 			if (old_count < 2) {
637 				/*
638 				 * There needs to have been at least 1 usecount left on the rootvnode
639 				 */
640 				panic("(4) Unexpected pre-decrement value (%d) of usecount for rootvnode %p",
641 				    old_count, rootdir_with_usecount);
642 			}
643 			lck_rw_unlock_shared(&rootvnode_rw_lock);
644 		} else {
645 			lck_rw_unlock_shared(&rootvnode_rw_lock);
646 			vnode_rele(rootdir_with_usecount);
647 		}
648 		rootdir_with_usecount = NULLVP;
649 	}
650 
651 	if ((cnp->cn_flags & HASBUF)) {
652 		cnp->cn_flags &= ~HASBUF;
653 		if (cnp->cn_pnlen == MAXPATHLEN) {
654 			zfree(ZV_NAMEI, cnp->cn_pnbuf);
655 		} else {
656 			kfree_data(cnp->cn_pnbuf, cnp->cn_pnlen);
657 		}
658 	}
659 	cnp->cn_pnbuf = NULL;
660 	ndp->ni_vp = NULLVP;
661 	ndp->ni_dvp = NULLVP;
662 
663 #if CONFIG_VOLFS
664 	/*
665 	 * Deal with volfs fallout.
666 	 *
667 	 * At this point, if we were originally given a volfs path that
668 	 * looks like /.vol/123/456, then we would have had to convert it into
669 	 * a full path.  Assuming that part worked properly, we will now attempt
670 	 * to conduct a lookup of the item in the namespace.  Under normal
671 	 * circumstances, if a user looked up /tmp/foo and it was not there, it
672 	 * would be permissible to return ENOENT.
673 	 *
674 	 * However, we may not want to do that here.  Specifically, the volfs path
675 	 * uniquely identifies a certain item in the namespace regardless of where it
676 	 * lives.  If the item has moved in between the time we constructed the
677 	 * path and now, when we're trying to do a lookup/authorization on the full
678 	 * path, we may have gotten an ENOENT.
679 	 *
680 	 * At this point we can no longer tell if the path no longer exists
681 	 * or if the item in question no longer exists. It could have been renamed
682 	 * away, in which case the /.vol identifier is still valid.
683 	 *
684 	 * Do this dance a maximum of MAX_VOLFS_RESTARTS times.
685 	 */
686 	if ((error == ENOENT) && (ndp->ni_cnd.cn_flags & CN_VOLFSPATH)) {
687 		if (volfs_restarts < MAX_VOLFS_RESTARTS) {
688 			volfs_restarts++;
689 			goto vnode_recycled;
690 		}
691 	}
692 #endif
693 
694 	if (error == ERECYCLE) {
695 		/* vnode was recycled underneath us. re-drive lookup to start at
696 		 *  the beginning again, since recycling invalidated last lookup*/
697 		ndp->ni_cnd.cn_flags = cnpflags;
698 		ndp->ni_dvp = usedvp;
699 		goto vnode_recycled;
700 	}
701 
702 
703 	return error;
704 }
705 
706 int
namei_compound_available(vnode_t dp,struct nameidata * ndp)707 namei_compound_available(vnode_t dp, struct nameidata *ndp)
708 {
709 	if ((ndp->ni_flag & NAMEI_COMPOUNDOPEN) != 0) {
710 		return vnode_compound_open_available(dp);
711 	}
712 
713 	return 0;
714 }
715 
716 int
lookup_check_for_resolve_prefix(char * path,size_t pathbuflen,size_t len,uint32_t * resolve_flags,size_t * prefix_len)717 lookup_check_for_resolve_prefix(char *path, size_t pathbuflen, size_t len, uint32_t *resolve_flags, size_t *prefix_len)
718 {
719 	int error = 0;
720 	*resolve_flags = (uint32_t)RESOLVE_CHECKED;
721 	*prefix_len = 0;
722 
723 	if (len < (sizeof("/.nofollow/") - 1) || path[0] != '/' || path[1] != '.') {
724 		return 0;
725 	}
726 
727 	if ((strncmp(&path[2], "nofollow/", (sizeof("nofollow/") - 1)) == 0)) {
728 		*resolve_flags |= RESOLVE_NOFOLLOW_ANY;
729 		*prefix_len = sizeof("/.nofollow") - 1;
730 	} else if ((len >= sizeof("/.resolve/1/") - 1) &&
731 	    strncmp(&path[2], "resolve/", (sizeof("resolve/") - 1)) == 0) {
732 		char * flag = path + (sizeof("/.resolve/") - 1);
733 		char *next = flag;
734 		char last_char = path[pathbuflen - 1];
735 
736 		/* no leading zeroes or non digits */
737 		if ((flag[0] == '0' && flag[1] != '/') ||
738 		    flag[0] < '0' || flag[0] > '9') {
739 			error = EINVAL;
740 			goto out;
741 		}
742 
743 		path[pathbuflen - 1] = '\0';
744 		unsigned long flag_val = strtoul(flag, &next, 10);
745 		path[pathbuflen - 1] = last_char;
746 		if (next[0] != '/' || (flag_val & ~(RESOLVE_NOFOLLOW_ANY))) {
747 			error = EINVAL;
748 			goto out;
749 		}
750 		assert(next >= flag);
751 		*resolve_flags |= (uint32_t)flag_val;
752 		*prefix_len = (size_t)(next - path);
753 	}
754 out:
755 	assert(*prefix_len <= sizeof("/.resolve/2147483647"));
756 	return error;
757 }
758 
759 static int
lookup_authorize_search(vnode_t dp,struct componentname * cnp,int dp_authorized_in_cache,vfs_context_t ctx)760 lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx)
761 {
762 #if !CONFIG_MACF
763 #pragma unused(cnp)
764 #endif
765 
766 	int error;
767 
768 	if (!dp_authorized_in_cache) {
769 		error = vnode_authorize(dp, NULL, KAUTH_VNODE_SEARCH, ctx);
770 		if (error) {
771 			return error;
772 		}
773 	}
774 #if CONFIG_MACF
775 	error = mac_vnode_check_lookup(ctx, dp, cnp);
776 	if (error) {
777 		return error;
778 	}
779 #endif /* CONFIG_MACF */
780 
781 	return 0;
782 }
783 
784 static void
lookup_consider_update_cache(vnode_t dvp,vnode_t vp,struct componentname * cnp,int nc_generation)785 lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation)
786 {
787 	int isdot_or_dotdot;
788 	isdot_or_dotdot = (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') || (cnp->cn_flags & ISDOTDOT);
789 
790 	if (vp->v_name == NULL || vp->v_parent == NULLVP) {
791 		int  update_flags = 0;
792 
793 		if (isdot_or_dotdot == 0) {
794 			if (vp->v_name == NULL) {
795 				update_flags |= VNODE_UPDATE_NAME;
796 			}
797 			if (dvp != NULLVP && vp->v_parent == NULLVP) {
798 				update_flags |= VNODE_UPDATE_PARENT;
799 			}
800 
801 			if (update_flags) {
802 				vnode_update_identity(vp, dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, update_flags);
803 			}
804 		}
805 	}
806 	if ((cnp->cn_flags & MAKEENTRY) && (vp->v_flag & VNCACHEABLE) && LIST_FIRST(&vp->v_nclinks) == NULL) {
807 		/*
808 		 * missing from name cache, but should
809 		 * be in it... this can happen if volfs
810 		 * causes the vnode to be created or the
811 		 * name cache entry got recycled but the
812 		 * vnode didn't...
813 		 * check to make sure that ni_dvp is valid
814 		 * cache_lookup_path may return a NULL
815 		 * do a quick check to see if the generation of the
816 		 * directory matches our snapshot... this will get
817 		 * rechecked behind the name cache lock, but if it
818 		 * already fails to match, no need to go any further
819 		 */
820 		if (dvp != NULLVP && (nc_generation == dvp->v_nc_generation) && (!isdot_or_dotdot)) {
821 			cache_enter_with_gen(dvp, vp, cnp, nc_generation);
822 		}
823 	}
824 }
825 
#if NAMEDRSRCFORK
/*
 * Resolve the resource fork stream of the regular file `dp`.
 *
 * May replace ni_dvp and ni_vp.  On success ni_vp is the stream vnode (which
 * may be NULL for a create); the file `dp` becomes ni_dvp when the parent is
 * wanted (the iocount on any previous ni_dvp is dropped), otherwise the
 * iocount on `dp` is dropped.  On failure no iocount is dropped here, so the
 * data fork and its parent directory (if one was provided) keep theirs.
 *
 * Returns:	0	Success
 *		ENOENT	dp is not a regular file, or the stream does not exist
 *			(and this is not a create)
 *		EPERM	operation other than DELETE/CREATE/LOOKUP, or
 *			CN_ALLOWRSRCFORK is not set
 *		vnode_getnamedstream:???
 */
static int
lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname *cnp, int wantparent, vfs_context_t ctx)
{
	vnode_t stream_vp = NULLVP;
	enum nsoperation nsop;
	int nsflags = 0;
	int error;

	if (dp->v_type != VREG) {
		return ENOENT;
	}

	/* Map the name operation onto the matching named stream operation. */
	switch (cnp->cn_nameiop) {
	case DELETE:
		nsop = NS_DELETE;
		break;
	case CREATE:
		nsop = NS_CREATE;
		break;
	case LOOKUP:
		nsop = NS_OPEN;
		break;
	default:
		return EPERM;
	}

	/* Make sure our use of "/..namedfork/rsrc" is allowed. */
	if ((cnp->cn_flags & CN_ALLOWRSRCFORK) == 0) {
		return EPERM;
	}

	if (cnp->cn_flags & CN_RAW_ENCRYPTED) {
		nsflags |= NS_GETRAWENCRYPTED;
	}

	/* Ask the file system for the resource fork. */
	error = vnode_getnamedstream(dp, &stream_vp, XATTR_RESOURCEFORK_NAME, nsop, nsflags, ctx);

	/* During a create, it OK for stream vnode to be missing. */
	if (error == ENOATTR || error == ENOENT) {
		error = (nsop == NS_CREATE) ? 0 : ENOENT;
	}
	if (error) {
		return error;
	}

	/* The "parent" of the stream is the file. */
	if (!wantparent) {
		vnode_put(dp);
	} else {
		if (ndp->ni_dvp) {
			vnode_put(ndp->ni_dvp);
		}
		ndp->ni_dvp = dp;
	}
	ndp->ni_vp = stream_vp;  /* on create this may be null */

	/* Restore the truncated pathname buffer (for audits). */
	if (ndp->ni_pathlen == 1 && ndp->ni_next[0] == '\0') {
		/*
		 * Only the '/' was replaced with '\0', but with symlinks
		 * involved the buffer in which that happened may not be the
		 * current pathname buffer.  Putting back just the "/" could
		 * leave the string unterminated, so the entire suffix,
		 * terminator included, is written back.
		 */
		strncpy(ndp->ni_next, _PATH_RSRCFORKSPEC, sizeof(_PATH_RSRCFORKSPEC));
		cnp->cn_nameptr = ndp->ni_next + 1;
		cnp->cn_namelen = sizeof(_PATH_RSRCFORKSPEC) - 1;
		ndp->ni_next += cnp->cn_namelen;
		if (ndp->ni_next[0] != '\0') {
			panic("Incorrect termination of path in %s", __FUNCTION__);
		}
	}
	cnp->cn_flags &= ~MAKEENTRY;

	return 0;
}
#endif /* NAMEDRSRCFORK */
926 
927 /*
928  * iocounts in:
929  *      --One on ni_vp.  One on ni_dvp if there is more path, or we didn't come through the
930  *      cache, or we came through the cache and the caller doesn't want the parent.
931  *
932  * iocounts out:
933  *	--Leaves us in the correct state for the next step, whatever that might be.
934  *	--If we find a symlink, returns with iocounts on both ni_vp and ni_dvp.
935  *	--If we are to look up another component, then we have an iocount on ni_vp and
936  *	nothing else.
937  *	--If we are done, returns an iocount on ni_vp, and possibly on ni_dvp depending on nameidata flags.
938  *	--In the event of an error, may return with ni_dvp NULL'ed out (in which case, iocount
939  *	was dropped).
940  */
941 static int
lookup_handle_found_vnode(struct nameidata * ndp,struct componentname * cnp,int rdonly,int vbusyflags,int * keep_going,int nc_generation,int wantparent,int atroot,vfs_context_t ctx)942 lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly,
943     int vbusyflags, int *keep_going, int nc_generation,
944     int wantparent, int atroot, vfs_context_t ctx)
945 {
946 	vnode_t dp;
947 	int error;
948 	char *cp;
949 
950 	dp = ndp->ni_vp;
951 	*keep_going = 0;
952 
953 	if (ndp->ni_vp == NULLVP) {
954 		panic("NULL ni_vp in %s", __FUNCTION__);
955 	}
956 
957 	if (atroot) {
958 		goto nextname;
959 	}
960 
961 	/*
962 	 * Take into account any additional components consumed by
963 	 * the underlying filesystem.
964 	 */
965 	if (cnp->cn_consume > 0) {
966 		cnp->cn_nameptr += cnp->cn_consume;
967 		ndp->ni_next += cnp->cn_consume;
968 		ndp->ni_pathlen -= cnp->cn_consume;
969 		cnp->cn_consume = 0;
970 	} else {
971 		lookup_consider_update_cache(ndp->ni_dvp, dp, cnp, nc_generation);
972 	}
973 
974 	/*
975 	 * Check to see if the vnode has been mounted on...
976 	 * if so find the root of the mounted file system.
977 	 * Updates ndp->ni_vp.
978 	 */
979 	error = lookup_traverse_mountpoints(ndp, cnp, dp, vbusyflags, ctx);
980 	dp = ndp->ni_vp;
981 	if (error) {
982 		goto out;
983 	}
984 
985 #if CONFIG_MACF
986 	if (vfs_flags(vnode_mount(dp)) & MNT_MULTILABEL) {
987 		error = vnode_label(vnode_mount(dp), NULL, dp, NULL, 0, ctx);
988 		if (error) {
989 			goto out;
990 		}
991 	}
992 #endif
993 
994 	/*
995 	 * Check for symbolic link
996 	 */
997 	if ((dp->v_type == VLNK) &&
998 	    ((cnp->cn_flags & FOLLOW) || (ndp->ni_flag & NAMEI_TRAILINGSLASH) || *ndp->ni_next == '/')) {
999 		cnp->cn_flags |= ISSYMLINK;
1000 		*keep_going = 1;
1001 		return 0;
1002 	}
1003 
1004 	/*
1005 	 * Check for bogus trailing slashes.
1006 	 */
1007 	if ((ndp->ni_flag & NAMEI_TRAILINGSLASH)) {
1008 		if (dp->v_type != VDIR) {
1009 			error = ENOTDIR;
1010 			goto out;
1011 		}
1012 		ndp->ni_flag &= ~(NAMEI_TRAILINGSLASH);
1013 	}
1014 
1015 #if NAMEDSTREAMS
1016 	/*
1017 	 * Deny namei/lookup requests to resolve paths that point to shadow files.
1018 	 * Access to shadow files must be conducted by explicit calls to VNOP_LOOKUP
1019 	 * directly, and not use lookup/namei
1020 	 */
1021 	if (vnode_isshadow(dp)) {
1022 		error = ENOENT;
1023 		goto out;
1024 	}
1025 #endif
1026 
1027 nextname:
1028 	/*
1029 	 * Not a symbolic link.  If more pathname,
1030 	 * continue at next component, else return.
1031 	 *
1032 	 * Definitely have a dvp if there's another slash
1033 	 */
1034 	if (*ndp->ni_next == '/') {
1035 		cnp->cn_nameptr = ndp->ni_next + 1;
1036 		ndp->ni_pathlen--;
1037 		while (*cnp->cn_nameptr == '/') {
1038 			cnp->cn_nameptr++;
1039 			ndp->ni_pathlen--;
1040 		}
1041 
1042 		cp = cnp->cn_nameptr;
1043 		vnode_put(ndp->ni_dvp);
1044 		ndp->ni_dvp = NULLVP;
1045 
1046 		if (*cp == '\0') {
1047 			goto emptyname;
1048 		}
1049 
1050 		*keep_going = 1;
1051 		return 0;
1052 	}
1053 
1054 	/*
1055 	 * Disallow directory write attempts on read-only file systems.
1056 	 */
1057 	if (rdonly &&
1058 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1059 		error = EROFS;
1060 		goto out;
1061 	}
1062 
1063 	/* If SAVESTART is set, we should have a dvp */
1064 	if (cnp->cn_flags & SAVESTART) {
1065 		/*
1066 		 * note that we already hold a reference
1067 		 * on both dp and ni_dvp, but for some reason
1068 		 * can't get another one... in this case we
1069 		 * need to do vnode_put on dp in 'bad2'
1070 		 */
1071 		if ((vnode_get(ndp->ni_dvp))) {
1072 			error = ENOENT;
1073 			goto out;
1074 		}
1075 		ndp->ni_startdir = ndp->ni_dvp;
1076 	}
1077 	if (!wantparent && ndp->ni_dvp) {
1078 		vnode_put(ndp->ni_dvp);
1079 		ndp->ni_dvp = NULLVP;
1080 	}
1081 
1082 	if (cnp->cn_flags & AUDITVNPATH1) {
1083 		AUDIT_ARG(vnpath, dp, ARG_VNODE1);
1084 	} else if (cnp->cn_flags & AUDITVNPATH2) {
1085 		AUDIT_ARG(vnpath, dp, ARG_VNODE2);
1086 	}
1087 
1088 #if NAMEDRSRCFORK
1089 	/*
1090 	 * Caller wants the resource fork.
1091 	 */
1092 	if ((cnp->cn_flags & CN_WANTSRSRCFORK) && (dp != NULLVP)) {
1093 		error = lookup_handle_rsrc_fork(dp, ndp, cnp, wantparent, ctx);
1094 		if (error != 0) {
1095 			goto out;
1096 		}
1097 
1098 		dp = ndp->ni_vp;
1099 	}
1100 #endif
1101 	if (kdebug_enable) {
1102 		kdebug_lookup(ndp->ni_vp, cnp);
1103 	}
1104 
1105 	return 0;
1106 
1107 emptyname:
1108 	error = lookup_handle_emptyname(ndp, cnp, wantparent);
1109 	if (error != 0) {
1110 		goto out;
1111 	}
1112 
1113 	return 0;
1114 out:
1115 	return error;
1116 }
1117 
1118 /*
1119  * Comes in iocount on ni_vp.  May overwrite ni_dvp, but doesn't interpret incoming value.
1120  */
1121 static int
lookup_handle_emptyname(struct nameidata * ndp,struct componentname * cnp,int wantparent)1122 lookup_handle_emptyname(struct nameidata *ndp, struct componentname *cnp, int wantparent)
1123 {
1124 	vnode_t dp;
1125 	int error = 0;
1126 
1127 	dp = ndp->ni_vp;
1128 	cnp->cn_namelen = 0;
1129 	/*
1130 	 * A degenerate name (e.g. / or "") which is a way of
1131 	 * talking about a directory, e.g. like "/." or ".".
1132 	 */
1133 	if (dp->v_type != VDIR) {
1134 		error = ENOTDIR;
1135 		goto out;
1136 	}
1137 	if (cnp->cn_nameiop == CREATE && dp == rootvnode) {
1138 		error = EEXIST;
1139 		goto out;
1140 	}
1141 	if (cnp->cn_nameiop != LOOKUP) {
1142 		error = EISDIR;
1143 		goto out;
1144 	}
1145 	if (wantparent) {
1146 		/*
1147 		 * note that we already hold a reference
1148 		 * on dp, but for some reason can't
1149 		 * get another one... in this case we
1150 		 * need to do vnode_put on dp in 'bad'
1151 		 */
1152 		if ((vnode_get(dp))) {
1153 			error = ENOENT;
1154 			goto out;
1155 		}
1156 		ndp->ni_dvp = dp;
1157 	}
1158 	cnp->cn_flags &= ~ISDOTDOT;
1159 	cnp->cn_flags |= ISLASTCN;
1160 	ndp->ni_next = cnp->cn_nameptr;
1161 	ndp->ni_vp = dp;
1162 
1163 	if (cnp->cn_flags & AUDITVNPATH1) {
1164 		AUDIT_ARG(vnpath, dp, ARG_VNODE1);
1165 	} else if (cnp->cn_flags & AUDITVNPATH2) {
1166 		AUDIT_ARG(vnpath, dp, ARG_VNODE2);
1167 	}
1168 	if (cnp->cn_flags & SAVESTART) {
1169 		panic("lookup: SAVESTART");
1170 	}
1171 
1172 	return 0;
1173 out:
1174 	return error;
1175 }
1176 /*
1177  * Search a pathname.
1178  * This is a very central and rather complicated routine.
1179  *
1180  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
1181  * The starting directory is taken from ni_startdir. The pathname is
1182  * descended until done, or a symbolic link is encountered. The variable
1183  * ni_more is clear if the path is completed; it is set to one if a
1184  * symbolic link needing interpretation is encountered.
1185  *
1186  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
1187  * whether the name is to be looked up, created, renamed, or deleted.
1188  * When CREATE, RENAME, or DELETE is specified, information usable in
1189  * creating, renaming, or deleting a directory entry may be calculated.
1190  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
1191  * locked. If flag has WANTPARENT or'ed into it, the parent directory is
1192  * returned unlocked. Otherwise the parent directory is not returned. If
1193  * the target of the pathname exists and LOCKLEAF is or'ed into the flag
1194  * the target is returned locked, otherwise it is returned unlocked.
1195  * When creating or renaming and LOCKPARENT is specified, the target may not
1196  * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
1197  *
1198  * Overall outline of lookup:
1199  *
1200  * dirloop:
1201  *	identify next component of name at ndp->ni_ptr
1202  *	handle degenerate case where name is null string
1203  *	if .. and crossing mount points and on mounted filesys, find parent
1204  *	call VNOP_LOOKUP routine for next component name
1205  *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
1206  *	    component vnode returned in ni_vp (if it exists), locked.
1207  *	if result vnode is mounted on and crossing mount points,
1208  *	    find mounted on vnode
1209  *	if more components of name, do next level at dirloop
1210  *	return the answer in ni_vp, locked if LOCKLEAF set
1211  *	    if LOCKPARENT set, return locked parent in ni_dvp
1212  *	    if WANTPARENT set, return unlocked parent in ni_dvp
1213  *
1214  * Returns:	0			Success
1215  *		ENOENT			No such file or directory
1216  *		EBADF			Bad file descriptor
1217  *		ENOTDIR			Not a directory
1218  *		EROFS			Read-only file system [CREATE]
1219  *		EISDIR			Is a directory [CREATE]
1220  *		cache_lookup_path:ERECYCLE  (vnode was recycled from underneath us, redrive lookup again)
1221  *		vnode_authorize:EROFS
1222  *		vnode_authorize:EACCES
1223  *		vnode_authorize:EPERM
1224  *		vnode_authorize:???
1225  *		VNOP_LOOKUP:ENOENT	No such file or directory
1226  *		VNOP_LOOKUP:EJUSTRETURN	Restart system call (INTERNAL)
1227  *		VNOP_LOOKUP:???
1228  *		VFS_ROOT:ENOTSUP
1229  *		VFS_ROOT:ENOENT
1230  *		VFS_ROOT:???
1231  */
1232 int
lookup(struct nameidata * ndp)1233 lookup(struct nameidata *ndp)
1234 {
1235 	char    *cp;            /* pointer into pathname argument */
1236 	vnode_t         tdp;            /* saved dp */
1237 	vnode_t         dp;             /* the directory we are searching */
1238 	int docache = 1;                /* == 0 do not cache last component */
1239 	int wantparent;                 /* 1 => wantparent or lockparent flag */
1240 	int rdonly;                     /* lookup read-only flag bit */
1241 	int dp_authorized = 0;
1242 	int error = 0;
1243 	struct componentname *cnp = &ndp->ni_cnd;
1244 	vfs_context_t ctx = cnp->cn_context;
1245 	int vbusyflags = 0;
1246 	int nc_generation = 0;
1247 	vnode_t last_dp = NULLVP;
1248 	int keep_going;
1249 	int atroot;
1250 
1251 	/*
1252 	 * Setup: break out flag bits into variables.
1253 	 */
1254 	if (cnp->cn_flags & NOCACHE) {
1255 		docache = 0;
1256 	}
1257 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
1258 	rdonly = cnp->cn_flags & RDONLY;
1259 	cnp->cn_flags &= ~ISSYMLINK;
1260 	cnp->cn_consume = 0;
1261 
1262 	dp = ndp->ni_startdir;
1263 	ndp->ni_startdir = NULLVP;
1264 
1265 	if ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) {
1266 		vbusyflags = LK_NOWAIT;
1267 	}
1268 	cp = cnp->cn_nameptr;
1269 
1270 	if (*cp == '\0') {
1271 		if ((vnode_getwithref(dp))) {
1272 			dp = NULLVP;
1273 			error = ENOENT;
1274 			goto bad;
1275 		}
1276 		ndp->ni_vp = dp;
1277 		error = lookup_handle_emptyname(ndp, cnp, wantparent);
1278 		if (error) {
1279 			goto bad;
1280 		}
1281 
1282 		return 0;
1283 	}
1284 dirloop:
1285 	atroot = 0;
1286 	ndp->ni_vp = NULLVP;
1287 
1288 	if ((error = cache_lookup_path(ndp, cnp, dp, ctx, &dp_authorized, last_dp))) {
1289 		dp = NULLVP;
1290 		goto bad;
1291 	}
1292 	if ((cnp->cn_flags & ISLASTCN)) {
1293 		if (docache) {
1294 			cnp->cn_flags |= MAKEENTRY;
1295 		}
1296 	} else {
1297 		cnp->cn_flags |= MAKEENTRY;
1298 	}
1299 
1300 	dp = ndp->ni_dvp;
1301 
1302 	if (ndp->ni_vp != NULLVP) {
1303 		/*
1304 		 * cache_lookup_path returned a non-NULL ni_vp then,
1305 		 * we're guaranteed that the dp is a VDIR, it's
1306 		 * been authorized, and vp is not ".."
1307 		 *
1308 		 * make sure we don't try to enter the name back into
1309 		 * the cache if this vp is purged before we get to that
1310 		 * check since we won't have serialized behind whatever
1311 		 * activity is occurring in the FS that caused the purge
1312 		 */
1313 		if (dp != NULLVP) {
1314 			nc_generation = dp->v_nc_generation - 1;
1315 		}
1316 
1317 		goto returned_from_lookup_path;
1318 	}
1319 
1320 	/*
1321 	 * Handle "..": three special cases.
1322 	 * 1. if at starting directory (e.g. the cwd/usedvp)
1323 	 *    and RESOLVE_BENEATH, then return EACCES.
1324 	 * 2. If at root directory (e.g. after chroot)
1325 	 *    or at absolute root directory
1326 	 *    then ignore it so can't get out.
1327 	 * 3. If this vnode is the root of a mounted
1328 	 *    filesystem, then replace it with the
1329 	 *    vnode which was mounted on so we take the
1330 	 *    .. in the other file system.
1331 	 */
1332 	if ((cnp->cn_flags & ISDOTDOT)) {
1333 		/* if dp is the starting directory and RESOLVE_BENEATH, we should return EACCES */
1334 		if ((ndp->ni_flag & NAMEI_RESOLVE_BENEATH) && (dp == ndp->ni_usedvp)) {
1335 			error = EACCES;
1336 			goto bad;
1337 		}
1338 		/*
1339 		 * if this is a chroot'ed process, check if the current
1340 		 * directory is still a subdirectory of the process's
1341 		 * root directory.
1342 		 */
1343 		if (ndp->ni_rootdir && (ndp->ni_rootdir != rootvnode) &&
1344 		    dp != ndp->ni_rootdir) {
1345 			int sdir_error;
1346 			int is_subdir = FALSE;
1347 
1348 			sdir_error = vnode_issubdir(dp, ndp->ni_rootdir,
1349 			    &is_subdir, vfs_context_kernel());
1350 
1351 			/*
1352 			 * If we couldn't determine if dp is a subdirectory of
1353 			 * ndp->ni_rootdir (sdir_error != 0), we let the request
1354 			 * proceed.
1355 			 */
1356 			if (!sdir_error && !is_subdir) {
1357 				vnode_put(dp);
1358 				dp = ndp->ni_rootdir;
1359 				/*
1360 				 * There's a ref on the process's root directory
1361 				 * but we can't use vnode_getwithref here as
1362 				 * there is nothing preventing that ref being
1363 				 * released by another thread.
1364 				 */
1365 				if (vnode_get(dp)) {
1366 					dp = NULLVP;
1367 					error = ENOENT;
1368 					goto bad;
1369 				}
1370 			}
1371 		}
1372 
1373 		for (;;) {
1374 			if (dp == ndp->ni_rootdir || dp == rootvnode) {
1375 				ndp->ni_dvp = dp;
1376 				ndp->ni_vp = dp;
1377 				/*
1378 				 * we're pinned at the root
1379 				 * we've already got one reference on 'dp'
1380 				 * courtesy of cache_lookup_path... take
1381 				 * another one for the ".."
1382 				 * if we fail to get the new reference, we'll
1383 				 * drop our original down in 'bad'
1384 				 */
1385 				if (vnode_get(dp)) {
1386 					error = ENOENT;
1387 					goto bad;
1388 				}
1389 				atroot = 1;
1390 				goto returned_from_lookup_path;
1391 			}
1392 			if ((dp->v_flag & VROOT) == 0 ||
1393 			    (cnp->cn_flags & NOCROSSMOUNT)) {
1394 				break;
1395 			}
1396 			if (dp->v_mount == NULL) {      /* forced umount */
1397 				error = EBADF;
1398 				goto bad;
1399 			}
1400 			tdp = dp;
1401 			dp = tdp->v_mount->mnt_vnodecovered;
1402 
1403 			if ((vnode_getwithref(dp))) {
1404 				vnode_put(tdp);
1405 				dp = NULLVP;
1406 				error = ENOENT;
1407 				goto bad;
1408 			}
1409 
1410 			vnode_put(tdp);
1411 
1412 			ndp->ni_dvp = dp;
1413 			dp_authorized = 0;
1414 		}
1415 	}
1416 
1417 	/*
1418 	 * We now have a segment name to search for, and a directory to search.
1419 	 */
1420 #if CONFIG_UNION_MOUNTS
1421 unionlookup:
1422 #endif /* CONFIG_UNION_MOUNTS */
1423 	ndp->ni_vp = NULLVP;
1424 
1425 	if (dp->v_type != VDIR) {
1426 		error = ENOTDIR;
1427 		goto lookup_error;
1428 	}
1429 	if ((cnp->cn_flags & DONOTAUTH) != DONOTAUTH) {
1430 		error = lookup_authorize_search(dp, cnp, dp_authorized, ctx);
1431 		if (error) {
1432 			goto lookup_error;
1433 		}
1434 	}
1435 
1436 	/*
1437 	 * Now that we've authorized a lookup, can bail out if the filesystem
1438 	 * will be doing a batched operation.  Return an iocount on dvp.
1439 	 */
1440 #if NAMEDRSRCFORK
1441 	if ((cnp->cn_flags & ISLASTCN) && namei_compound_available(dp, ndp) && !(cnp->cn_flags & CN_WANTSRSRCFORK)) {
1442 #else
1443 	if ((cnp->cn_flags & ISLASTCN) && namei_compound_available(dp, ndp)) {
1444 #endif /* NAMEDRSRCFORK */
1445 		ndp->ni_flag |= NAMEI_UNFINISHED;
1446 		ndp->ni_ncgeneration = dp->v_nc_generation;
1447 		return 0;
1448 	}
1449 
1450 	nc_generation = dp->v_nc_generation;
1451 
1452 	/*
1453 	 * Note:
1454 	 * Filesystems that support hardlinks may want to call vnode_update_identity
1455 	 * if the lookup operation below will modify the in-core vnode to belong to a new point
1456 	 * in the namespace.  VFS cannot infer whether or not the look up operation makes the vnode
1457 	 * name change or change parents.  Without this, the lookup may make update
1458 	 * filesystem-specific in-core metadata but fail to update the v_parent or v_name
1459 	 * fields in the vnode.  If VFS were to do this, it would be necessary to call
1460 	 * vnode_update_identity on every lookup operation -- expensive!
1461 	 *
1462 	 * However, even with this in place, multiple lookups may occur in between this lookup
1463 	 * and the subsequent vnop, so, at best, we could only guarantee that you would get a
1464 	 * valid path back, and not necessarily the one that you wanted.
1465 	 *
1466 	 * Example:
1467 	 * /tmp/a == /foo/b
1468 	 *
1469 	 * If you are now looking up /foo/b and the vnode for this link represents /tmp/a,
1470 	 * vnode_update_identity will fix the parentage so that you can get /foo/b back
1471 	 * through the v_parent chain (preventing you from getting /tmp/b back). It would
1472 	 * not fix whether or not you should or should not get /tmp/a vs. /foo/b.
1473 	 */
1474 
1475 	error = VNOP_LOOKUP(dp, &ndp->ni_vp, cnp, ctx);
1476 
1477 	if (error) {
1478 lookup_error:
1479 #if CONFIG_UNION_MOUNTS
1480 		if ((error == ENOENT) &&
1481 		    (dp->v_mount != NULL) &&
1482 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
1483 			tdp = dp;
1484 			error = lookup_traverse_union(tdp, &dp, ctx);
1485 			vnode_put(tdp);
1486 			if (error) {
1487 				dp = NULLVP;
1488 				goto bad;
1489 			}
1490 
1491 			ndp->ni_dvp = dp;
1492 			dp_authorized = 0;
1493 			goto unionlookup;
1494 		}
1495 #endif /* CONFIG_UNION_MOUNTS */
1496 
1497 		if (error != EJUSTRETURN) {
1498 			goto bad;
1499 		}
1500 
1501 		if (ndp->ni_vp != NULLVP) {
1502 			panic("leaf should be empty");
1503 		}
1504 
1505 #if NAMEDRSRCFORK
1506 		/*
1507 		 * At this point, error should be EJUSTRETURN.
1508 		 *
1509 		 * If CN_WANTSRSRCFORK is set, that implies that the
1510 		 * underlying filesystem could not find the "parent" of the
1511 		 * resource fork (the data fork), and we are doing a lookup
1512 		 * for a CREATE event.
1513 		 *
1514 		 * However, this should be converted to an error, as the
1515 		 * failure to find this parent should disallow further
1516 		 * progress to try and acquire a resource fork vnode.
1517 		 */
1518 		if (cnp->cn_flags & CN_WANTSRSRCFORK) {
1519 			error = ENOENT;
1520 			goto bad;
1521 		}
1522 #endif
1523 
1524 		error = lookup_validate_creation_path(ndp);
1525 		if (error) {
1526 			goto bad;
1527 		}
1528 		/*
1529 		 * We return with ni_vp NULL to indicate that the entry
1530 		 * doesn't currently exist, leaving a pointer to the
1531 		 * referenced directory vnode in ndp->ni_dvp.
1532 		 */
1533 		if (cnp->cn_flags & SAVESTART) {
1534 			if ((vnode_get(ndp->ni_dvp))) {
1535 				error = ENOENT;
1536 				goto bad;
1537 			}
1538 			ndp->ni_startdir = ndp->ni_dvp;
1539 		}
1540 		if (!wantparent) {
1541 			vnode_put(ndp->ni_dvp);
1542 		}
1543 
1544 		if (kdebug_enable) {
1545 			kdebug_lookup(ndp->ni_dvp, cnp);
1546 		}
1547 		return 0;
1548 	}
1549 returned_from_lookup_path:
1550 	/* We'll always have an iocount on ni_vp when this finishes. */
1551 	error = lookup_handle_found_vnode(ndp, cnp, rdonly, vbusyflags, &keep_going, nc_generation, wantparent, atroot, ctx);
1552 	if (error != 0) {
1553 		goto bad2;
1554 	}
1555 
1556 	if (keep_going) {
1557 		dp = ndp->ni_vp;
1558 
1559 		/* namei() will handle symlinks */
1560 		if ((dp->v_type == VLNK) &&
1561 		    ((cnp->cn_flags & FOLLOW) || (ndp->ni_flag & NAMEI_TRAILINGSLASH) || *ndp->ni_next == '/')) {
1562 			return 0;
1563 		}
1564 
1565 		/*
1566 		 * Otherwise, there's more path to process.
1567 		 * cache_lookup_path is now responsible for dropping io ref on dp
1568 		 * when it is called again in the dirloop.  This ensures we hold
1569 		 * a ref on dp until we complete the next round of lookup.
1570 		 */
1571 		last_dp = dp;
1572 
1573 		goto dirloop;
1574 	}
1575 
1576 	return 0;
1577 bad2:
1578 	if (ndp->ni_dvp) {
1579 		vnode_put(ndp->ni_dvp);
1580 	}
1581 
1582 	vnode_put(ndp->ni_vp);
1583 	ndp->ni_vp = NULLVP;
1584 
1585 	if (kdebug_enable) {
1586 		kdebug_lookup(dp, cnp);
1587 	}
1588 	return error;
1589 
1590 bad:
1591 	if (dp) {
1592 		vnode_put(dp);
1593 	}
1594 	ndp->ni_vp = NULLVP;
1595 
1596 	if (kdebug_enable) {
1597 		kdebug_lookup(dp, cnp);
1598 	}
1599 	return error;
1600 }
1601 
#if CONFIG_UNION_MOUNTS
/*
 * Given a vnode in a union mount, traverse to the equivalent
 * vnode in the underlying mount.
 *
 * dvp      directory vnode in the upper (union) mount.
 * new_dvp  out: the equivalent vnode in the underlying mount, returned
 *          with an iocount on success; NULL on every failure path.
 * ctx      context used for the lookup in the underlying mount.
 *
 * If dvp is the root of its mount the answer is simply the covered vnode.
 * Otherwise the path from the mount root down to dvp is rebuilt from the
 * v_parent/name chain and looked up starting at the covered vnode.
 *
 * Returns 0 on success, ENOENT if an iocount or a vnode name could not be
 * obtained, ENAMETOOLONG if the rebuilt path does not fit in MAXPATHLEN,
 * or an error from namei().
 */
int
lookup_traverse_union(vnode_t dvp, vnode_t *new_dvp, vfs_context_t ctx)
{
	char *path = NULL, *pp;
	const char *name, *np;
	size_t len;
	int error = 0;
	struct nameidata nd;
	vnode_t vp = dvp;

	*new_dvp = NULL;

	if (vp && vp->v_flag & VROOT) {
		vnode_t covered = vp->v_mount->mnt_vnodecovered;

		/*
		 * Publish the covered vnode only once its iocount is held, so
		 * a failed call never hands back an unreferenced pointer.
		 */
		if (vnode_getwithref(covered)) {
			return ENOENT;
		}
		*new_dvp = covered;
		return 0;
	}

	path = zalloc_flags(ZV_NAMEI, Z_WAITOK | Z_NOFAIL);

	/*
	 * Walk back up to the mountpoint following the
	 * v_parent chain and build a slash-separated path.
	 * Then lookup that path starting with the covered vnode.
	 *
	 * The path is assembled right-to-left, ending at the last byte
	 * of the buffer; 'pp' always points at its current first byte.
	 */
	pp = path + (MAXPATHLEN - 1);
	*pp = '\0';

	while (1) {
		name = vnode_getname(vp);
		if (name == NULL) {
			printf("lookup_traverse_union: null parent name: .%s\n", pp);
			error = ENOENT;
			goto done;
		}
		len = strlen(name);
		if ((len + 1) > (size_t)(pp - path)) {          // Enough space for this name ?
			error = ENAMETOOLONG;
			vnode_putname(name);
			goto done;
		}
		for (np = name + len; len > 0; len--) { // Copy name backwards
			*--pp = *--np;
		}
		vnode_putname(name);
		vp = vp->v_parent;
		if (vp == NULLVP || vp->v_flag & VROOT) {
			break;
		}
		*--pp = '/';
	}

	/* Evaluate the path in the underlying mount */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(pp), ctx);
	nd.ni_dvp = dvp->v_mount->mnt_vnodecovered;
	error = namei(&nd);
	if (error == 0) {
		*new_dvp = nd.ni_vp;
	}
	nameidone(&nd);
done:
	if (path) {
		zfree(ZV_NAMEI, path);
	}
	return error;
}
#endif /* CONFIG_UNION_MOUNTS */
1676 
1677 int
1678 lookup_validate_creation_path(struct nameidata *ndp)
1679 {
1680 	struct componentname *cnp = &ndp->ni_cnd;
1681 
1682 	/*
1683 	 * If creating and at end of pathname, then can consider
1684 	 * allowing file to be created.
1685 	 */
1686 	if (cnp->cn_flags & RDONLY) {
1687 		return EROFS;
1688 	}
1689 	if ((cnp->cn_flags & ISLASTCN) && (ndp->ni_flag & NAMEI_TRAILINGSLASH) && !(cnp->cn_flags & WILLBEDIR)) {
1690 		return ENOENT;
1691 	}
1692 
1693 	return 0;
1694 }
1695 
1696 /*
1697  * Modifies only ni_vp.  Always returns with ni_vp still valid (iocount held).
1698  */
1699 static int
1700 lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp,
1701     int vbusyflags, vfs_context_t ctx)
1702 {
1703 	mount_t mp;
1704 	vnode_t tdp;
1705 	int error = 0;
1706 	uint32_t depth = 0;
1707 	vnode_t mounted_on_dp;
1708 	int current_mount_generation = 0;
1709 #if CONFIG_TRIGGERS
1710 	vnode_t triggered_dp = NULLVP;
1711 	int retry_cnt = 0;
1712 #define MAX_TRIGGER_RETRIES 1
1713 #endif
1714 
1715 	if (dp->v_type != VDIR || cnp->cn_flags & NOCROSSMOUNT) {
1716 		return 0;
1717 	}
1718 
1719 	mounted_on_dp = dp;
1720 #if CONFIG_TRIGGERS
1721 restart:
1722 #endif
1723 	current_mount_generation = mount_generation;
1724 
1725 	while (dp->v_mountedhere) {
1726 		vnode_lock_spin(dp);
1727 		if ((mp = dp->v_mountedhere)) {
1728 			mp->mnt_crossref++;
1729 			vnode_unlock(dp);
1730 		} else {
1731 			vnode_unlock(dp);
1732 			break;
1733 		}
1734 
1735 		if (ISSET(mp->mnt_lflag, MNT_LFORCE)) {
1736 			mount_dropcrossref(mp, dp, 0);
1737 			break;  // don't traverse into a forced unmount
1738 		}
1739 
1740 
1741 		if (vfs_busy(mp, vbusyflags)) {
1742 			mount_dropcrossref(mp, dp, 0);
1743 			if (vbusyflags == LK_NOWAIT) {
1744 				error = ENOENT;
1745 				goto out;
1746 			}
1747 
1748 			continue;
1749 		}
1750 
1751 		error = VFS_ROOT(mp, &tdp, ctx);
1752 
1753 		mount_dropcrossref(mp, dp, 0);
1754 		vfs_unbusy(mp);
1755 
1756 		if (error) {
1757 			goto out;
1758 		}
1759 
1760 		vnode_put(dp);
1761 		ndp->ni_vp = dp = tdp;
1762 		if (dp->v_type != VDIR) {
1763 #if DEVELOPMENT || DEBUG
1764 			panic("%s : Root of filesystem not a directory",
1765 			    __FUNCTION__);
1766 #else
1767 			break;
1768 #endif
1769 		}
1770 		depth++;
1771 	}
1772 
1773 #if CONFIG_TRIGGERS
1774 	/*
1775 	 * The triggered_dp check here is required but is susceptible to a
1776 	 * (unlikely) race in which trigger mount is done from here and is
1777 	 * unmounted before we get past vfs_busy above. We retry to deal with
1778 	 * that case but it has the side effect of unwanted retries for
1779 	 * "special" processes which don't want to trigger mounts.
1780 	 */
1781 	if (dp->v_resolve && retry_cnt < MAX_TRIGGER_RETRIES) {
1782 		error = vnode_trigger_resolve(dp, ndp, ctx);
1783 		if (error) {
1784 			goto out;
1785 		}
1786 		if (dp == triggered_dp) {
1787 			retry_cnt += 1;
1788 		} else {
1789 			retry_cnt = 0;
1790 		}
1791 		triggered_dp = dp;
1792 		goto restart;
1793 	}
1794 #endif /* CONFIG_TRIGGERS */
1795 
1796 	if (depth) {
1797 		mp = mounted_on_dp->v_mountedhere;
1798 
1799 		if (mp) {
1800 			mount_lock_spin(mp);
1801 			mp->mnt_realrootvp_vid = dp->v_id;
1802 			mp->mnt_realrootvp = dp;
1803 			mp->mnt_generation = current_mount_generation;
1804 			mount_unlock(mp);
1805 		}
1806 	}
1807 
1808 	return 0;
1809 
1810 out:
1811 	return error;
1812 }
1813 
1814 /*
1815  * Takes ni_vp and ni_dvp non-NULL.  Returns with *new_dp set to the location
1816  * at which to start a lookup with a resolved path, and all other iocounts dropped.
1817  */
1818 static int
1819 lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, bool *new_dp_has_iocount, vfs_context_t ctx)
1820 {
1821 	int error;
1822 	char *cp = NULL;               /* pointer into pathname argument */
1823 	u_int cplen = 0;
1824 	uio_t auio;
1825 	UIO_STACKBUF(uio_buf, 1);
1826 	int need_newpathbuf;
1827 	u_int linklen = 0;
1828 	struct componentname *cnp = &ndp->ni_cnd;
1829 	vnode_t dp;
1830 	char *tmppn;
1831 	u_int rsrclen = (cnp->cn_flags & CN_WANTSRSRCFORK) ? sizeof(_PATH_RSRCFORKSPEC) : 0;
1832 	bool dp_has_iocount = false;
1833 
1834 	if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
1835 		return ELOOP;
1836 	}
1837 #if CONFIG_MACF
1838 	if ((error = mac_vnode_check_readlink(ctx, ndp->ni_vp)) != 0) {
1839 		return error;
1840 	}
1841 #endif /* MAC */
1842 	if (ndp->ni_pathlen > 1 || !(cnp->cn_flags & HASBUF)) {
1843 		need_newpathbuf = 1;
1844 	} else {
1845 		need_newpathbuf = 0;
1846 	}
1847 
1848 	if (need_newpathbuf) {
1849 		if (!(cnp->cn_flags & HASBUF) || cnp->cn_pnlen == MAXPATHLEN) {
1850 			cp = zalloc(ZV_NAMEI);
1851 			cplen = MAXPATHLEN;
1852 		} else {
1853 			assert(proc_support_long_paths(vfs_context_proc(ctx)));
1854 			cp = kalloc_data(cnp->cn_pnlen, Z_WAITOK | Z_ZERO);
1855 			cplen = cnp->cn_pnlen;
1856 		}
1857 	} else {
1858 		cp = cnp->cn_pnbuf;
1859 	}
1860 	auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
1861 
1862 	uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN);
1863 
1864 	error = VNOP_READLINK(ndp->ni_vp, auio, ctx);
1865 
1866 	if (!error) {
1867 		user_ssize_t resid = uio_resid(auio);
1868 
1869 		assert(resid <= MAXPATHLEN);
1870 
1871 		if (resid == MAXPATHLEN) {
1872 			linklen = 0;
1873 		} else {
1874 			/*
1875 			 * Safe to set unsigned with a [larger] signed type here
1876 			 * because 0 <= uio_resid <= MAXPATHLEN and MAXPATHLEN
1877 			 * is only 1024.
1878 			 */
1879 			linklen = (u_int)strnlen(cp, MAXPATHLEN - (u_int)resid);
1880 		}
1881 
1882 		size_t maxlen = proc_support_long_paths(vfs_context_proc(ctx)) ? MAXLONGPATHLEN : MAXPATHLEN;
1883 
1884 		if (linklen == 0) {
1885 			error = ENOENT;
1886 		} else if (linklen + ndp->ni_pathlen + rsrclen > maxlen) {
1887 			error = ENAMETOOLONG;
1888 		}
1889 	}
1890 
1891 	if (error) {
1892 		if (need_newpathbuf) {
1893 			if (cplen == MAXPATHLEN) {
1894 				zfree(ZV_NAMEI, cp);
1895 			} else {
1896 				kfree_data(cp, cplen);
1897 			}
1898 		}
1899 		return error;
1900 	}
1901 
1902 	if (need_newpathbuf) {
1903 		tmppn = cnp->cn_pnbuf;
1904 		u_int tmplen = cnp->cn_pnlen;
1905 		bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
1906 		cnp->cn_pnbuf = cp;
1907 		cnp->cn_pnlen = cplen;
1908 
1909 		if ((cnp->cn_flags & HASBUF)) {
1910 			if (tmplen == MAXPATHLEN) {
1911 				zfree(ZV_NAMEI, tmppn);
1912 			} else {
1913 				kfree_data(tmppn, tmplen);
1914 			}
1915 		} else {
1916 			cnp->cn_flags |= HASBUF;
1917 		}
1918 	} else {
1919 		cnp->cn_pnbuf[linklen] = '\0';
1920 	}
1921 
1922 	ndp->ni_pathlen += linklen;
1923 	cnp->cn_nameptr = cnp->cn_pnbuf;
1924 
1925 	/*
1926 	 * starting point for 'relative'
1927 	 * symbolic link path
1928 	 */
1929 	dp = ndp->ni_dvp;
1930 
1931 	/*
1932 	 * get rid of reference returned via 'lookup'
1933 	 * ni_dvp is released only if we restart at /.
1934 	 */
1935 	vnode_put(ndp->ni_vp);
1936 	ndp->ni_vp = NULLVP;
1937 	ndp->ni_dvp = NULLVP;
1938 
1939 	dp_has_iocount = true;
1940 
1941 	/*
1942 	 * Check if symbolic link restarts us at the root
1943 	 */
1944 	if (*(cnp->cn_nameptr) == '/') {
1945 		/* return EACCES if resolve beneath and the symlink restarts at root */
1946 		if (ndp->ni_flag & NAMEI_RESOLVE_BENEATH) {
1947 			vnode_put(dp); /* ALWAYS have a dvp for a symlink */
1948 			return EACCES;
1949 		}
1950 		while (*(cnp->cn_nameptr) == '/') {
1951 			cnp->cn_nameptr++;
1952 			ndp->ni_pathlen--;
1953 		}
1954 		if (linklen != 0) {
1955 			vnode_put(dp); /* ALWAYS have a dvp for a symlink */
1956 			dp_has_iocount = false;
1957 			if ((dp = ndp->ni_rootdir) == NULLVP) {
1958 				return ENOENT;
1959 			}
1960 		}
1961 	}
1962 
1963 	*new_dp = dp;
1964 	*new_dp_has_iocount = dp_has_iocount;
1965 
1966 	return 0;
1967 }
1968 
1969 /*
1970  * relookup - lookup a path name component
1971  *    Used by lookup to re-aquire things.
1972  */
1973 int
1974 relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
1975 {
1976 	struct vnode *dp = NULL;                /* the directory we are searching */
1977 	int wantparent;                 /* 1 => wantparent or lockparent flag */
1978 	int rdonly;                     /* lookup read-only flag bit */
1979 	int error = 0;
1980 #ifdef NAMEI_DIAGNOSTIC
1981 	int i, newhash;                 /* DEBUG: check name hash */
1982 	char *cp;                       /* DEBUG: check name ptr/len */
1983 #endif
1984 	vfs_context_t ctx = cnp->cn_context;
1985 
1986 	/*
1987 	 * Setup: break out flag bits into variables.
1988 	 */
1989 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
1990 	rdonly = cnp->cn_flags & RDONLY;
1991 	cnp->cn_flags &= ~ISSYMLINK;
1992 
1993 	if (cnp->cn_flags & NOCACHE) {
1994 		cnp->cn_flags &= ~MAKEENTRY;
1995 	} else {
1996 		cnp->cn_flags |= MAKEENTRY;
1997 	}
1998 
1999 	dp = dvp;
2000 
2001 	/*
2002 	 * Check for degenerate name (e.g. / or "")
2003 	 * which is a way of talking about a directory,
2004 	 * e.g. like "/." or ".".
2005 	 */
2006 	if (cnp->cn_nameptr[0] == '\0') {
2007 		if (cnp->cn_nameiop != LOOKUP || wantparent) {
2008 			error = EISDIR;
2009 			goto bad;
2010 		}
2011 		if (dp->v_type != VDIR) {
2012 			error = ENOTDIR;
2013 			goto bad;
2014 		}
2015 		if ((vnode_get(dp))) {
2016 			error = ENOENT;
2017 			goto bad;
2018 		}
2019 		*vpp = dp;
2020 
2021 		if (cnp->cn_flags & SAVESTART) {
2022 			panic("lookup: SAVESTART");
2023 		}
2024 		return 0;
2025 	}
2026 	/*
2027 	 * We now have a segment name to search for, and a directory to search.
2028 	 */
2029 	if ((error = VNOP_LOOKUP(dp, vpp, cnp, ctx))) {
2030 		if (error != EJUSTRETURN) {
2031 			goto bad;
2032 		}
2033 #if DIAGNOSTIC
2034 		if (*vpp != NULL) {
2035 			panic("leaf should be empty");
2036 		}
2037 #endif
2038 		/*
2039 		 * If creating and at end of pathname, then can consider
2040 		 * allowing file to be created.
2041 		 */
2042 		if (rdonly) {
2043 			error = EROFS;
2044 			goto bad;
2045 		}
2046 		/*
2047 		 * We return with ni_vp NULL to indicate that the entry
2048 		 * doesn't currently exist, leaving a pointer to the
2049 		 * (possibly locked) directory inode in ndp->ni_dvp.
2050 		 */
2051 		return 0;
2052 	}
2053 	dp = *vpp;
2054 
2055 #if DIAGNOSTIC
2056 	/*
2057 	 * Check for symbolic link
2058 	 */
2059 	if (dp->v_type == VLNK && (cnp->cn_flags & FOLLOW)) {
2060 		panic("relookup: symlink found.");
2061 	}
2062 #endif
2063 
2064 	/*
2065 	 * Disallow directory write attempts on read-only file systems.
2066 	 */
2067 	if (rdonly &&
2068 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
2069 		error = EROFS;
2070 		goto bad2;
2071 	}
2072 	/* ASSERT(dvp == ndp->ni_startdir) */
2073 
2074 	return 0;
2075 
2076 bad2:
2077 	vnode_put(dp);
2078 bad:
2079 	*vpp = NULL;
2080 
2081 	return error;
2082 }
2083 
2084 /*
2085  * Free pathname buffer
2086  */
2087 void
2088 nameidone(struct nameidata *ndp)
2089 {
2090 	if (ndp->ni_cnd.cn_flags & HASBUF) {
2091 		char *tmp = ndp->ni_cnd.cn_pnbuf;
2092 
2093 		ndp->ni_cnd.cn_pnbuf = NULL;
2094 		ndp->ni_cnd.cn_flags &= ~HASBUF;
2095 		if (ndp->ni_cnd.cn_pnlen == MAXPATHLEN) {
2096 			zfree(ZV_NAMEI, tmp);
2097 		} else {
2098 			kfree_data(tmp, ndp->ni_cnd.cn_pnlen);
2099 		}
2100 	}
2101 }
2102 
2103 
2104 /*
2105  * Log (part of) a pathname using kdebug, as used by fs_usage.  The path up to
2106  * and including the current component name are logged.  Up to NUMPARMS * 4
2107  * bytes of pathname will be logged.  If the path to be logged is longer than
2108  * that, then the last NUMPARMS * 4 bytes are logged. That is, the truncation
2109  * removes the leading portion of the path.
2110  *
2111  * The logging is done via multiple KDBG_RELEASE calls.  The first one is marked
2112  * with DBG_FUNC_START.  The last one is marked with DBG_FUNC_END (in addition
2113  * to DBG_FUNC_START if it is also the first).  There may be intermediate ones
2114  * with neither DBG_FUNC_START nor DBG_FUNC_END.
2115  *
2116  * The first event passes the vnode pointer and 24 (on K32, 12) bytes of
2117  * pathname.  The remaining events add 32 (on K32, 16) bytes of
2118  * pathname each.  The minimum number of events required to pass the path is
2119  * used.  Any excess padding in the final event (because not all of the 24 or 32
2120  * (on K32, 12 or 16) bytes are needed for the remainder of the path) is set to
2121  * zero bytes, or '>' if there is more path beyond the current component name
2122  * (usually because an intermediate component was not found).
2123  *
2124  * NOTE: If the path length is greater than NUMPARMS * 4, or is exactly of the
2125  * form 24 + N * 32 (or on K32, 12 + N * 16), there will be no padding.
2126  */
2127 #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
2128 
2129 void
2130 kdebug_vfs_lookup(const char *path, size_t path_len, void *vnp,
2131     uint32_t flags)
2132 {
2133 	unsigned long path_words[4] = {};
2134 	size_t trace_len = MIN(sizeof(path_words) - sizeof(path_words[0]), path_len);
2135 	size_t path_next = 0;
2136 	bool noprocfilt = flags & KDBG_VFS_LOOKUP_FLAG_NOPROCFILT;
2137 
2138 	assert(path_len >= 0);
2139 
2140 	int code = ((flags & KDBG_VFS_LOOKUP_FLAG_LOOKUP) ? VFS_LOOKUP :
2141 	    VFS_LOOKUP_DONE) | DBG_FUNC_START;
2142 
2143 	if (path_len <= (3 * (int)sizeof(long))) {
2144 		code |= DBG_FUNC_END;
2145 	}
2146 	memcpy(path_words, path, trace_len);
2147 	path_next += trace_len;
2148 
2149 	if (noprocfilt) {
2150 		KDBG_RELEASE_NOPROCFILT(code, kdebug_vnode(vnp), path_words[0],
2151 		    path_words[1], path_words[2]);
2152 	} else {
2153 		KDBG_RELEASE(code, kdebug_vnode(vnp), path_words[0], path_words[1],
2154 		    path_words[2]);
2155 	}
2156 
2157 	code &= ~DBG_FUNC_START;
2158 
2159 	for (int i = 3; i * (int)sizeof(long) < path_len; i += 4) {
2160 		trace_len = sizeof(path_words);
2161 		if ((i + 4) * (int)sizeof(long) >= path_len) {
2162 			code |= DBG_FUNC_END;
2163 			trace_len = path_len - path_next;
2164 			memset(path_words, 0, sizeof(path_words));
2165 		}
2166 		memcpy(path_words, &path[path_next], trace_len);
2167 		path_next += trace_len;
2168 
2169 		if (noprocfilt) {
2170 			KDBG_RELEASE_NOPROCFILT(code, path_words[0], path_words[1],
2171 			    path_words[2], path_words[3]);
2172 		} else {
2173 			KDBG_RELEASE(code, path_words[0], path_words[1],
2174 			    path_words[2], path_words[3]);
2175 		}
2176 	}
2177 }
2178 
2179 void
2180 kdebug_lookup_gen_events(long *path_words, int path_len, void *vnp, bool lookup)
2181 {
2182 	assert(path_len >= 0);
2183 	kdebug_vfs_lookup((const char *)path_words, path_len, vnp,
2184 	    lookup ? KDBG_VFS_LOOKUP_FLAG_LOOKUP : 0);
2185 }
2186 
2187 void
2188 kdebug_lookup(vnode_t vnp, struct componentname *cnp)
2189 {
2190 	kdebug_vfs_lookup(cnp->cn_pnbuf, strnlen(cnp->cn_pnbuf, cnp->cn_pnlen), vnp, KDBG_VFS_LOOKUP_FLAG_LOOKUP);
2191 }
2192 
2193 #else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
2194 
2195 void
2196 kdebug_vfs_lookup(const char *dbg_parms __unused, size_t dbg_namelen __unused,
2197     void *dp __unused, __unused kdebug_vfs_lookup_flags_t flags)
2198 {
2199 }
2200 
2201 static void
2202 kdebug_lookup(struct vnode *dp __unused, struct componentname *cnp __unused)
2203 {
2204 }
2205 #endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
2206 
2207 int
2208 vfs_getbyid(fsid_t *fsid, ino64_t ino, vnode_t *vpp, vfs_context_t ctx)
2209 {
2210 	mount_t mp;
2211 	int error;
2212 
2213 	mp = mount_lookupby_volfsid(fsid->val[0], 1);
2214 	if (mp == NULL) {
2215 		return EINVAL;
2216 	}
2217 
2218 	/* Get the target vnode. */
2219 	if (ino == 2) {
2220 		error = VFS_ROOT(mp, vpp, ctx);
2221 	} else {
2222 		error = VFS_VGET(mp, ino, vpp, ctx);
2223 	}
2224 
2225 	vfs_unbusy(mp);
2226 	return error;
2227 }
2228 /*
2229  * Obtain the real path from a legacy volfs style path.
2230  *
2231  * Valid formats of input path:
2232  *
2233  *	"555/@"
2234  *	"555/2"
2235  *	"555/123456"
2236  *	"555/123456/foobar"
2237  *
2238  * Where:
2239  *	555 represents the volfs file system id
2240  *	'@' and '2' are aliases to the root of a file system
2241  *	123456 represents a file id
2242  *	"foobar" represents a file name
2243  */
2244 #if CONFIG_VOLFS
2245 static int
2246 vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx)
2247 {
2248 	vnode_t vp;
2249 	struct mount *mp = NULL;
2250 	char  *str;
2251 	char ch;
2252 	unsigned long id;
2253 	ino64_t ino;
2254 	int error;
2255 	int length;
2256 
2257 	/* Get file system id and move str to next component. */
2258 	id = strtoul(path, &str, 10);
2259 	if (id == 0 || str[0] != '/') {
2260 		return EINVAL;
2261 	}
2262 	while (*str == '/') {
2263 		str++;
2264 	}
2265 	ch = *str;
2266 
2267 	if (id > INT_MAX) {
2268 		return ENOENT;
2269 	}
2270 	mp = mount_lookupby_volfsid((int)id, 1);
2271 	if (mp == NULL) {
2272 		return EINVAL;  /* unexpected failure */
2273 	}
2274 	/* Check for an alias to a file system root. */
2275 	if (ch == '@' && str[1] == '\0') {
2276 		ino = 2;
2277 		str++;
2278 	} else {
2279 		/* Get file id and move str to next component. */
2280 		ino = strtouq(str, &str, 10);
2281 	}
2282 
2283 	/* Get the target vnode. */
2284 	if (ino == 2) {
2285 		struct vfs_attr vfsattr;
2286 		int use_vfs_root = TRUE;
2287 
2288 		VFSATTR_INIT(&vfsattr);
2289 		VFSATTR_WANTED(&vfsattr, f_capabilities);
2290 		if (vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
2291 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
2292 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
2293 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
2294 				use_vfs_root = FALSE;
2295 			}
2296 		}
2297 
2298 		if (use_vfs_root) {
2299 			error = VFS_ROOT(mp, &vp, ctx);
2300 		} else {
2301 			error = VFS_VGET(mp, ino, &vp, ctx);
2302 		}
2303 	} else {
2304 		error = VFS_VGET(mp, ino, &vp, ctx);
2305 	}
2306 	vfs_unbusy(mp);
2307 	if (error) {
2308 		goto out;
2309 	}
2310 	realpath[0] = '\0';
2311 
2312 	/* Get the absolute path to this vnode. */
2313 	error = build_path(vp, realpath, (int)bufsize, &length, 0, ctx);
2314 	vnode_put(vp);
2315 
2316 	if (error == 0 && *str != '\0') {
2317 		size_t attempt = strlcat(realpath, str, MAXPATHLEN);
2318 		if (attempt > MAXPATHLEN) {
2319 			error = ENAMETOOLONG;
2320 		}
2321 	}
2322 out:
2323 	return error;
2324 }
2325 #endif
2326 
2327 void
2328 lookup_compound_vnop_post_hook(int error, vnode_t dvp, vnode_t vp, struct nameidata *ndp, int did_create)
2329 {
2330 	if (error == 0 && vp == NULLVP) {
2331 		panic("NULL vp with error == 0.");
2332 	}
2333 
2334 	/*
2335 	 * We don't want to do any of this if we didn't use the compound vnop
2336 	 * to perform the lookup... i.e. if we're allowing and using the legacy pattern,
2337 	 * where we did a full lookup.
2338 	 */
2339 	if ((ndp->ni_flag & NAMEI_COMPOUND_OP_MASK) == 0) {
2340 		return;
2341 	}
2342 
2343 	/*
2344 	 * If we're going to continue the lookup, we'll handle
2345 	 * all lookup-related updates at that time.
2346 	 */
2347 	if (error == EKEEPLOOKING) {
2348 		return;
2349 	}
2350 
2351 	/*
2352 	 * Only audit or update cache for *found* vnodes.  For creation
2353 	 * neither would happen in the non-compound-vnop case.
2354 	 */
2355 	if ((vp != NULLVP) && !did_create) {
2356 		/*
2357 		 * If MAKEENTRY isn't set, and we've done a successful compound VNOP,
2358 		 * then we certainly don't want to update cache or identity.
2359 		 */
2360 		if ((error != 0) || (ndp->ni_cnd.cn_flags & MAKEENTRY)) {
2361 			lookup_consider_update_cache(dvp, vp, &ndp->ni_cnd, ndp->ni_ncgeneration);
2362 		}
2363 		if (ndp->ni_cnd.cn_flags & AUDITVNPATH1) {
2364 			AUDIT_ARG(vnpath, vp, ARG_VNODE1);
2365 		} else if (ndp->ni_cnd.cn_flags & AUDITVNPATH2) {
2366 			AUDIT_ARG(vnpath, vp, ARG_VNODE2);
2367 		}
2368 	}
2369 
2370 	/*
2371 	 * If you created (whether you opened or not), cut a lookup tracepoint
2372 	 * for the parent dir (as would happen without a compound vnop).  Note: we may need
2373 	 * a vnode despite failure in this case!
2374 	 *
2375 	 * If you did not create:
2376 	 *      Found child (succeeded or not): cut a tracepoint for the child.
2377 	 *      Did not find child: cut a tracepoint with the parent.
2378 	 */
2379 	if (kdebug_enable) {
2380 		kdebug_lookup(vp ? vp : dvp, &ndp->ni_cnd);
2381 	}
2382 }
2383