1 /*
2 * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137
138 #include <nfs/nfs_conf.h>
139
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148
/*
 * Path-buffer helpers: when file-system events are configured we draw from
 * the fsevents pathbuff pool; otherwise fall back to the ZV_NAMEI zone.
 */
#if CONFIG_FSE
#define GET_PATH(x) \
	((x) = get_pathbuff())
#define RELEASE_PATH(x) \
	release_pathbuff(x)
#else
#define GET_PATH(x) \
	((x) = zalloc(ZV_NAMEI))
#define RELEASE_PATH(x) \
	zfree(ZV_NAMEI, x)
#endif /* CONFIG_FSE */

/* Fallback definitions for HFS/APFS fsctl selectors when not provided by headers. */
#ifndef HFS_GET_BOOT_INFO
#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
#endif

#ifndef HFS_SET_BOOT_INFO
#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
#endif

#ifndef APFSIOC_REVERT_TO_SNAPSHOT
#define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
#endif

extern void disk_conditioner_unmount(mount_t mp);

/* struct for checkdirs iteration */
struct cdirargs {
	vnode_t olddp;
	vnode_t newdp;
};
/* callback for checkdirs iteration */
static int checkdirs_callback(proc_t p, void * arg);

/* Forward declarations for the static helpers defined later in this file. */
static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
static int checkdirs(vnode_t olddp, vfs_context_t ctx);
void enablequotas(struct mount *mp, vfs_context_t ctx);
static int getfsstat_callback(mount_t mp, void * arg);
static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
static int sync_callback(mount_t, void *);
static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy);
static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs,
    int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
void vfs_notify_mount(vnode_t pdvp);

int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);

struct fd_vn_data * fg_vn_data_alloc(void);

/*
 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
 * Concurrent lookups (or lookups by ids) on hard links can cause the
 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
 * does) to return ENOENT as the path cannot be returned from the name cache
 * alone. We have no option but to retry and hope to get one namei->reverse path
 * generation done without an intervening lookup, lookup by id on the hard link
 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
 * which currently are the MAC hooks for rename, unlink and rmdir.
 */
#define MAX_AUTHORIZE_ENOENT_RETRIES 1024

/* Max retry limit for rename due to vnode recycling. */
#define MAX_RENAME_ERECYCLE_RETRIES 1024

static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
    int unlink_flags);

#ifdef CONFIG_IMGSRC_ACCESS
/* Helpers used only by the image-source (imageboot) relocation path. */
static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
static void mount_end_update(mount_t mp);
static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
#endif /* CONFIG_IMGSRC_ACCESS */

//snapshot functions
#if CONFIG_MNT_ROOTSNAP
static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
#else
static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
#endif

__private_extern__
int sync_internal(void);

__private_extern__
int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);

/* lock group/attr used for per-fd vnode data (fd_vn_data) */
static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);

/* vars for sync mutex */
static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);

extern lck_rw_t rootvnode_rw_lock;

/*
 * incremented each time a mount or unmount operation occurs
 * used to invalidate the cached value of the rootvp in the
 * mount structure utilized by cache_lookup_path
 */
uint32_t mount_generation = 0;

/* counts number of mount and unmount operations */
unsigned int vfs_nummntops = 0;

/* system-wide, per-boot unique mount ID */
static _Atomic uint64_t mount_unique_id = 1;

extern const struct fileops vnops;
#if CONFIG_APPLEDOUBLE
extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
#endif /* CONFIG_APPLEDOUBLE */
269
270 /*
271 * Virtual File System System Calls
272 */
273
274 /*
275 * Private in-kernel mounting spi (specific use-cases only)
276 */
277 boolean_t
vfs_iskernelmount(mount_t mp)278 vfs_iskernelmount(mount_t mp)
279 {
280 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
281 }
282
/*
 * kernel_mount:
 *	Mount a filesystem from inside the kernel (private SPI).
 *
 *	If 'vp' is NULLVP the mount-on vnode (and its parent) are looked up
 *	from 'path' via namei(), and the references so obtained are released
 *	here before returning.  If 'vp' is supplied, the caller owns the
 *	references on vp/pvp and only the component name is synthesized from
 *	'path' (which must then be a kernel-space string).
 *
 *	Returns 0 on success or an errno value.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/* 'path' is a kernel string here, hence UIO_SYSSPACE */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* restrict callers to the internal kernel-mount flag set */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode; fabricate just enough of
		 * the componentname for mount_common() from the kernel path.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* mark this as a kernel-initiated mount for mount_common() */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* only release the iocounts/namei state we acquired ourselves */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
332
333 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)334 vfs_mount_at_path(const char *fstype, const char *path,
335 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
336 int mnt_flags, int flags)
337 {
338 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
339 int error, km_flags = 0;
340
341 /*
342 * This call is currently restricted to specific use cases.
343 */
344 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
345 return ENOTSUP;
346 }
347
348 #if !defined(XNU_TARGET_OS_OSX)
349 if (strcmp(fstype, "lifs") == 0) {
350 syscall_flags |= MNT_NOEXEC;
351 }
352 #endif
353
354 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
355 km_flags |= KERNEL_MOUNT_NOAUTH;
356 }
357 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
358 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
359 }
360
361 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
362 syscall_flags, km_flags, vfs_context_kernel());
363 if (error) {
364 printf("%s: mount on %s failed, error %d\n", __func__, path,
365 error);
366 }
367
368 return error;
369 }
370
371 int
vfs_mount_override_type_name(mount_t mp,const char * name)372 vfs_mount_override_type_name(mount_t mp, const char *name)
373 {
374 if (mp == NULL || name == NULL) {
375 return EINVAL;
376 }
377
378 /* Override the FS type name. */
379 mount_lock_spin(mp);
380 strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
381 mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
382 mount_unlock(mp);
383
384 return 0;
385 }
386
387 /*
388 * Mount a file system.
389 */
390 /* ARGSUSED */
391 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)392 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
393 {
394 struct __mac_mount_args muap;
395
396 muap.type = uap->type;
397 muap.path = uap->path;
398 muap.flags = uap->flags;
399 muap.data = uap->data;
400 muap.mac_p = USER_ADDR_NULL;
401 return __mac_mount(p, &muap, retval);
402 }
403
404 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)405 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
406 {
407 struct componentname cn;
408 vfs_context_t ctx = vfs_context_current();
409 size_t dummy = 0;
410 int error;
411 int flags = uap->flags;
412 char fstypename[MFSNAMELEN];
413 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
414 vnode_t pvp;
415 vnode_t vp;
416
417 AUDIT_ARG(fd, uap->fd);
418 AUDIT_ARG(fflags, flags);
419 /* fstypename will get audited by mount_common */
420
421 /* Sanity check the flags */
422 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
423 return ENOTSUP;
424 }
425
426 if (flags & MNT_UNION) {
427 return EPERM;
428 }
429
430 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
431 if (error) {
432 return error;
433 }
434
435 if ((error = file_vnode(uap->fd, &vp)) != 0) {
436 return error;
437 }
438
439 if ((error = vnode_getwithref(vp)) != 0) {
440 file_drop(uap->fd);
441 return error;
442 }
443
444 pvp = vnode_getparent(vp);
445 if (pvp == NULL) {
446 vnode_put(vp);
447 file_drop(uap->fd);
448 return EINVAL;
449 }
450
451 memset(&cn, 0, sizeof(struct componentname));
452 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
453 cn.cn_pnlen = MAXPATHLEN;
454
455 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
456 zfree(ZV_NAMEI, cn.cn_pnbuf);
457 vnode_put(pvp);
458 vnode_put(vp);
459 file_drop(uap->fd);
460 return error;
461 }
462
463 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
464
465 zfree(ZV_NAMEI, cn.cn_pnbuf);
466 vnode_put(pvp);
467 vnode_put(vp);
468 file_drop(uap->fd);
469
470 return error;
471 }
472
/*
 * vfs_notify_mount:
 *	Announce a completed mount: signal a VQ_MOUNT vfs event (no specific
 *	mount is attached) and post NOTE_WRITE on the parent directory vnode
 *	so kevent watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
479
480 /*
481 * __mac_mount:
482 * Mount a file system taking into account MAC label behavior.
483 * See mount(2) man page for more information
484 *
485 * Parameters: p Process requesting the mount
486 * uap User argument descriptor (see below)
487 * retval (ignored)
488 *
489 * Indirect: uap->type Filesystem type
490 * uap->path Path to mount
491 * uap->data Mount arguments
492 * uap->mac_p MAC info
493 * uap->flags Mount flags
494 *
495 *
496 * Returns: 0 Success
497 * !0 Not success
498 */
/* set when someone attempts to remount the root FS read/write (see below) */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;         /* parent of the covered vnode (from namei) */
	vnode_t vp = NULL;          /* vnode to be covered by the new mount */
	int need_nameidone = 0;     /* nonzero once namei() state must be torn down */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;      /* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * NOTE(review): the final 'by_index' argument is always TRUE
		 * here since it repeats the enclosing condition — confirm
		 * whether a by-path variant was ever intended.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* the user_mac layout differs between 32- and 64-bit callers */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* reject absurd label sizes before allocating */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* A mount on '/' is implicitly an update mount (except union mounts). */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() tolerates a NULL labelstr (nothing was copied in) */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* drop the iocounts taken by namei() and tear down its state */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
661
662 /*
663 * common mount implementation (final stage of mounting)
664 *
665 * Arguments:
666 * fstypename file system type (ie it's vfs name)
667 * pvp parent of covered vnode
668 * vp covered vnode
669 * cnp component name (ie path) of covered vnode
670 * flags generic mount flags
671 * fsmountargs file system specific data
672 * labelstr optional MAC label
673 * kernelmount TRUE for mounts initiated from inside the kernel
674 * ctx caller's context
675 */
676 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)677 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
678 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
679 char *labelstr, vfs_context_t ctx)
680 {
681 #if !CONFIG_MACF
682 #pragma unused(labelstr)
683 #endif
684 struct vnode *devvp = NULLVP;
685 struct vnode *device_vnode = NULLVP;
686 #if CONFIG_MACF
687 struct vnode *rvp;
688 #endif
689 struct mount *mp;
690 struct vfstable *vfsp = (struct vfstable *)0;
691 struct proc *p = vfs_context_proc(ctx);
692 int error, flag = 0;
693 bool flag_set = false;
694 user_addr_t devpath = USER_ADDR_NULL;
695 int ronly = 0;
696 int mntalloc = 0;
697 boolean_t vfsp_ref = FALSE;
698 boolean_t is_rwlock_locked = FALSE;
699 boolean_t did_rele = FALSE;
700 boolean_t have_usecount = FALSE;
701 boolean_t did_set_lmount = FALSE;
702 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
703
704 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
705 /* Check for mutually-exclusive flag bits */
706 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
707 int bitcount = 0;
708 while (checkflags != 0) {
709 checkflags &= (checkflags - 1);
710 bitcount++;
711 }
712
713 if (bitcount > 1) {
714 //not allowed to request multiple mount-by-role flags
715 error = EINVAL;
716 goto out1;
717 }
718 #endif
719
720 /*
721 * Process an update for an existing mount
722 */
723 if (flags & MNT_UPDATE) {
724 if ((vp->v_flag & VROOT) == 0) {
725 error = EINVAL;
726 goto out1;
727 }
728 mp = vp->v_mount;
729
730 /* if unmount or mount in progress, return error */
731 mount_lock_spin(mp);
732 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
733 mount_unlock(mp);
734 error = EBUSY;
735 goto out1;
736 }
737 mp->mnt_lflag |= MNT_LMOUNT;
738 did_set_lmount = TRUE;
739 mount_unlock(mp);
740 lck_rw_lock_exclusive(&mp->mnt_rwlock);
741 is_rwlock_locked = TRUE;
742 /*
743 * We only allow the filesystem to be reloaded if it
744 * is currently mounted read-only.
745 */
746 if ((flags & MNT_RELOAD) &&
747 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
748 error = ENOTSUP;
749 goto out1;
750 }
751
752 /*
753 * If content protection is enabled, update mounts are not
754 * allowed to turn it off.
755 */
756 if ((mp->mnt_flag & MNT_CPROTECT) &&
757 ((flags & MNT_CPROTECT) == 0)) {
758 error = EINVAL;
759 goto out1;
760 }
761
762 /*
763 * can't turn off MNT_REMOVABLE either but it may be an unexpected
764 * failure to return an error for this so we'll just silently
765 * add it if it is not passed in.
766 */
767 if ((mp->mnt_flag & MNT_REMOVABLE) &&
768 ((flags & MNT_REMOVABLE) == 0)) {
769 flags |= MNT_REMOVABLE;
770 }
771
772 /* Can't downgrade the backer of the root FS */
773 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
774 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
775 error = ENOTSUP;
776 goto out1;
777 }
778
779 /*
780 * Only root, or the user that did the original mount is
781 * permitted to update it.
782 */
783 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
784 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
785 goto out1;
786 }
787 #if CONFIG_MACF
788 error = mac_mount_check_remount(ctx, mp);
789 if (error != 0) {
790 goto out1;
791 }
792 #endif
793 /*
794 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
795 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
796 */
797 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
798 flags |= MNT_NOSUID | MNT_NODEV;
799 if (mp->mnt_flag & MNT_NOEXEC) {
800 flags |= MNT_NOEXEC;
801 }
802 }
803 flag = mp->mnt_flag;
804 flag_set = true;
805
806
807
808 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
809
810 vfsp = mp->mnt_vtable;
811 goto update;
812 } // MNT_UPDATE
813
814 /*
815 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
816 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
817 */
818 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
819 flags |= MNT_NOSUID | MNT_NODEV;
820 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
821 flags |= MNT_NOEXEC;
822 }
823 }
824
825 /* XXXAUDIT: Should we capture the type on the error path as well? */
826 /* XXX cast-away const (audit_arg_text() does not modify its input) */
827 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
828 mount_list_lock();
829 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
830 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
831 vfsp->vfc_refcount++;
832 vfsp_ref = TRUE;
833 break;
834 }
835 }
836 mount_list_unlock();
837 if (vfsp == NULL) {
838 error = ENODEV;
839 goto out1;
840 }
841
842 /*
843 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
844 * except in ROSV configs and for the initial BaseSystem root.
845 */
846 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
847 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
848 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
849 error = EINVAL; /* unsupported request */
850 goto out1;
851 }
852
853 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
854 if (error != 0) {
855 goto out1;
856 }
857
858 /*
859 * Allocate and initialize the filesystem (mount_t)
860 */
861 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
862 mntalloc = 1;
863
864 /* Initialize the default IO constraints */
865 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
866 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
867 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
868 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
869 mp->mnt_devblocksize = DEV_BSIZE;
870 mp->mnt_alignmentmask = PAGE_MASK;
871 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
872 mp->mnt_ioscale = 1;
873 mp->mnt_ioflags = 0;
874 mp->mnt_realrootvp = NULLVP;
875 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
876
877 mp->mnt_lflag |= MNT_LMOUNT;
878 did_set_lmount = TRUE;
879
880 TAILQ_INIT(&mp->mnt_vnodelist);
881 TAILQ_INIT(&mp->mnt_workerqueue);
882 TAILQ_INIT(&mp->mnt_newvnodes);
883 mount_lock_init(mp);
884 lck_rw_lock_exclusive(&mp->mnt_rwlock);
885 is_rwlock_locked = TRUE;
886 mp->mnt_op = vfsp->vfc_vfsops;
887 mp->mnt_vtable = vfsp;
888 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
889 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
890 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
891 do {
892 int pathlen = MAXPATHLEN;
893
894 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
895 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
896 }
897 } while (0);
898 mp->mnt_vnodecovered = vp;
899 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
900 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
901 mp->mnt_devbsdunit = 0;
902 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
903
904 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
905 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
906
907 if (kernelmount) {
908 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
909 }
910 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
911 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
912 }
913
914 if (KERNEL_MOUNT_DEVFS & internal_flags) {
915 // kernel mounted devfs
916 mp->mnt_kern_flag |= MNTK_SYSTEM;
917 }
918
919 update:
920
921 /*
922 * Set the mount level flags.
923 */
924 if (flags & MNT_RDONLY) {
925 mp->mnt_flag |= MNT_RDONLY;
926 } else if (mp->mnt_flag & MNT_RDONLY) {
927 // disallow read/write upgrades of file systems that
928 // had the TYPENAME_OVERRIDE feature set.
929 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
930 error = EPERM;
931 goto out1;
932 }
933 mp->mnt_kern_flag |= MNTK_WANTRDWR;
934 }
935 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
936 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
937 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
938 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
939 MNT_QUARANTINE | MNT_CPROTECT);
940
941 #if SECURE_KERNEL
942 #if !CONFIG_MNT_SUID
943 /*
944 * On release builds of iOS based platforms, always enforce NOSUID on
945 * all mounts. We do this here because we can catch update mounts as well as
946 * non-update mounts in this case.
947 */
948 mp->mnt_flag |= (MNT_NOSUID);
949 #endif
950 #endif
951
952 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
953 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
954 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
955 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
956 MNT_QUARANTINE | MNT_CPROTECT);
957
958 #if CONFIG_MACF
959 if (flags & MNT_MULTILABEL) {
960 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
961 error = EINVAL;
962 goto out1;
963 }
964 mp->mnt_flag |= MNT_MULTILABEL;
965 }
966 #endif
967 /*
968 * Process device path for local file systems if requested.
969 *
970 * Snapshot and mount-by-role mounts do not use this path; they are
971 * passing other opaque data in the device path field.
972 *
973 * Basesystemroot mounts pass a device path to be resolved here,
974 * but it's just a char * already inside the kernel, which
975 * kernel_mount() shoved into a user_addr_t to call us. So for such
976 * mounts we must skip copyin (both of the address and of the string
977 * (in NDINIT).
978 */
979 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
980 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
981 boolean_t do_copyin_devpath = true;
982 #if CONFIG_BASESYSTEMROOT
983 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
985 // We have been passed fsmountargs, which is typed as a user_addr_t,
986 // but is actually a char ** pointing to a (kernelspace) string.
987 // We manually unpack it with a series of casts and dereferences
988 // that reverses what was done just above us on the stack in
989 // imageboot_pivot_image().
990 // After retrieving the path to the dev node (which we will NDINIT
991 // in a moment), we pass NULL fsmountargs on to the filesystem.
992 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
993 char **devnamepp = (char **)fsmountargs;
994 char *devnamep = *devnamepp;
995 devpath = CAST_USER_ADDR_T(devnamep);
996 do_copyin_devpath = false;
997 fsmountargs = USER_ADDR_NULL;
998
999 //Now that we have a mp, denote that this mount is for the basesystem.
1000 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1001 }
1002 #endif // CONFIG_BASESYSTEMROOT
1003
1004 if (do_copyin_devpath) {
1005 if (vfs_context_is64bit(ctx)) {
1006 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1007 goto out1;
1008 }
1009 fsmountargs += sizeof(devpath);
1010 } else {
1011 user32_addr_t tmp;
1012 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1013 goto out1;
1014 }
1015 /* munge into LP64 addr */
1016 devpath = CAST_USER_ADDR_T(tmp);
1017 fsmountargs += sizeof(tmp);
1018 }
1019 }
1020
1021 /* Lookup device and authorize access to it */
1022 if ((devpath)) {
1023 struct nameidata nd;
1024
1025 enum uio_seg seg = UIO_USERSPACE;
1026 #if CONFIG_BASESYSTEMROOT
1027 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1028 seg = UIO_SYSSPACE;
1029 }
1030 #endif // CONFIG_BASESYSTEMROOT
1031
1032 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1033 if ((error = namei(&nd))) {
1034 goto out1;
1035 }
1036
1037 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1038 devvp = nd.ni_vp;
1039
1040 nameidone(&nd);
1041
1042 if (devvp->v_type != VBLK) {
1043 error = ENOTBLK;
1044 goto out2;
1045 }
1046 if (major(devvp->v_rdev) >= nblkdev) {
1047 error = ENXIO;
1048 goto out2;
1049 }
1050 /*
1051 * If mount by non-root, then verify that user has necessary
1052 * permissions on the device.
1053 */
1054 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1055 mode_t accessmode = KAUTH_VNODE_READ_DATA;
1056
1057 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1058 accessmode |= KAUTH_VNODE_WRITE_DATA;
1059 }
1060 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1061 goto out2;
1062 }
1063 }
1064 }
1065 /* On first mount, preflight and open device */
1066 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1067 if ((error = vnode_ref(devvp))) {
1068 goto out2;
1069 }
1070 /*
1071 * Disallow multiple mounts of the same device.
1072 * Disallow mounting of a device that is currently in use
1073 * (except for root, which might share swap device for miniroot).
1074 * Flush out any old buffers remaining from a previous use.
1075 */
1076 if ((error = vfs_mountedon(devvp))) {
1077 goto out3;
1078 }
1079
1080 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1081 error = EBUSY;
1082 goto out3;
1083 }
1084 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1085 error = ENOTBLK;
1086 goto out3;
1087 }
1088 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1089 goto out3;
1090 }
1091
1092 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1093 #if CONFIG_MACF
1094 error = mac_vnode_check_open(ctx,
1095 devvp,
1096 ronly ? FREAD : FREAD | FWRITE);
1097 if (error) {
1098 goto out3;
1099 }
1100 #endif /* MAC */
1101 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1102 goto out3;
1103 }
1104
1105 mp->mnt_devvp = devvp;
1106 device_vnode = devvp;
1107 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1108 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1109 (device_vnode = mp->mnt_devvp)) {
1110 dev_t dev;
1111 int maj;
1112 /*
1113 * If upgrade to read-write by non-root, then verify
1114 * that user has necessary permissions on the device.
1115 */
1116 vnode_getalways(device_vnode);
1117
1118 if (suser(vfs_context_ucred(ctx), NULL) &&
1119 (error = vnode_authorize(device_vnode, NULL,
1120 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1121 ctx)) != 0) {
1122 vnode_put(device_vnode);
1123 goto out2;
1124 }
1125
1126 /* Tell the device that we're upgrading */
1127 dev = (dev_t)device_vnode->v_rdev;
1128 maj = major(dev);
1129
1130 if ((u_int)maj >= (u_int)nblkdev) {
1131 panic("Volume mounted on a device with invalid major number.");
1132 }
1133
1134 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1135 vnode_put(device_vnode);
1136 device_vnode = NULLVP;
1137 if (error != 0) {
1138 goto out2;
1139 }
1140 }
1141 } // localargs && !(snapshot | data | vm)
1142
1143 #if CONFIG_MACF
1144 if ((flags & MNT_UPDATE) == 0) {
1145 mac_mount_label_init(mp);
1146 mac_mount_label_associate(ctx, mp);
1147 }
1148 if (labelstr) {
1149 if ((flags & MNT_UPDATE) != 0) {
1150 error = mac_mount_check_label_update(ctx, mp);
1151 if (error != 0) {
1152 goto out3;
1153 }
1154 }
1155 }
1156 #endif
1157 /*
1158 * Mount the filesystem. We already asserted that internal_flags
1159 * cannot have more than one mount-by-role bit set.
1160 */
1161 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1162 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1163 (caddr_t)fsmountargs, 0, ctx);
1164 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1165 #if CONFIG_ROSV_STARTUP
1166 struct mount *origin_mp = (struct mount*)fsmountargs;
1167 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1168 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1169 if (error) {
1170 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1171 } else {
1172 /* Mark volume associated with system volume */
1173 mp->mnt_kern_flag |= MNTK_SYSTEM;
1174
1175 /* Attempt to acquire the mnt_devvp and set it up */
1176 struct vnode *mp_devvp = NULL;
1177 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1178 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1179 0, &mp_devvp, vfs_context_kernel());
1180 if (!lerr) {
1181 mp->mnt_devvp = mp_devvp;
1182 //vnode_lookup took an iocount, need to drop it.
1183 vnode_put(mp_devvp);
1184 // now set `device_vnode` to the devvp that was acquired.
1185 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1186 // note that though the iocount above was dropped, the mount acquires
1187 // an implicit reference against the device.
1188 device_vnode = mp_devvp;
1189 }
1190 }
1191 }
1192 #else
1193 error = EINVAL;
1194 #endif
1195 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1196 #if CONFIG_MOUNT_VM
1197 struct mount *origin_mp = (struct mount*)fsmountargs;
1198 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1199 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1200 if (error) {
1201 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1202 } else {
1203 /* Mark volume associated with system volume and a swap mount */
1204 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1205 /* Attempt to acquire the mnt_devvp and set it up */
1206 struct vnode *mp_devvp = NULL;
1207 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1208 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1209 0, &mp_devvp, vfs_context_kernel());
1210 if (!lerr) {
1211 mp->mnt_devvp = mp_devvp;
1212 //vnode_lookup took an iocount, need to drop it.
1213 vnode_put(mp_devvp);
1214
1215 // now set `device_vnode` to the devvp that was acquired.
1216 // note that though the iocount above was dropped, the mount acquires
1217 // an implicit reference against the device.
1218 device_vnode = mp_devvp;
1219 }
1220 }
1221 }
1222 #else
1223 error = EINVAL;
1224 #endif
1225 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1226 #if CONFIG_MOUNT_PREBOOTRECOVERY
1227 struct mount *origin_mp = (struct mount*)fsmountargs;
1228 uint32_t mount_role = 0;
1229 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1230 mount_role = VFS_PREBOOT_ROLE;
1231 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1232 mount_role = VFS_RECOVERY_ROLE;
1233 }
1234
1235 if (mount_role != 0) {
1236 fs_role_mount_args_t frma = {origin_mp, mount_role};
1237 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1238 if (error) {
1239 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1240 } else {
1241 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1242 /* Mark volume associated with system volume */
1243 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1244 /* Attempt to acquire the mnt_devvp and set it up */
1245 struct vnode *mp_devvp = NULL;
1246 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1247 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1248 0, &mp_devvp, vfs_context_kernel());
1249 if (!lerr) {
1250 mp->mnt_devvp = mp_devvp;
1251 //vnode_lookup took an iocount, need to drop it.
1252 vnode_put(mp_devvp);
1253
1254 // now set `device_vnode` to the devvp that was acquired.
1255 // note that though the iocount above was dropped, the mount acquires
1256 // an implicit reference against the device.
1257 device_vnode = mp_devvp;
1258 }
1259 }
1260 }
1261 } else {
1262 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1263 error = EINVAL;
1264 }
1265 #else
1266 error = EINVAL;
1267 #endif
1268 } else {
1269 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1270 }
1271
1272 if (flags & MNT_UPDATE) {
1273 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1274 mp->mnt_flag &= ~MNT_RDONLY;
1275 }
1276 mp->mnt_flag &= ~
1277 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1278 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1279 if (error) {
1280 mp->mnt_flag = flag; /* restore flag value */
1281 }
1282 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1283 lck_rw_done(&mp->mnt_rwlock);
1284 is_rwlock_locked = FALSE;
1285 if (!error) {
1286 enablequotas(mp, ctx);
1287 }
1288 goto exit;
1289 }
1290
1291 /*
1292 * Put the new filesystem on the mount list after root.
1293 */
1294 if (error == 0) {
1295 struct vfs_attr vfsattr;
1296 #if CONFIG_MACF
1297 error = mac_mount_check_mount_late(ctx, mp);
1298 if (error != 0) {
1299 goto out4;
1300 }
1301
1302 if (vfs_flags(mp) & MNT_MULTILABEL) {
1303 error = VFS_ROOT(mp, &rvp, ctx);
1304 if (error) {
1305 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1306 goto out4;
1307 }
1308 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1309 /*
1310 * drop reference provided by VFS_ROOT
1311 */
1312 vnode_put(rvp);
1313
1314 if (error) {
1315 goto out4;
1316 }
1317 }
1318 #endif /* MAC */
1319
1320 vnode_lock_spin(vp);
1321 CLR(vp->v_flag, VMOUNT);
1322 vp->v_mountedhere = mp;
1323 vnode_unlock(vp);
1324
1325 /*
1326 * taking the name_cache_lock exclusively will
1327 * insure that everyone is out of the fast path who
1328 * might be trying to use a now stale copy of
1329 * vp->v_mountedhere->mnt_realrootvp
1330 * bumping mount_generation causes the cached values
1331 * to be invalidated
1332 */
1333 name_cache_lock();
1334 mount_generation++;
1335 name_cache_unlock();
1336
1337 error = vnode_ref(vp);
1338 if (error != 0) {
1339 goto out4;
1340 }
1341
1342 have_usecount = TRUE;
1343
1344 error = checkdirs(vp, ctx);
1345 if (error != 0) {
1346 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1347 goto out4;
1348 }
1349 /*
1350 * there is no cleanup code here so I have made it void
1351 * we need to revisit this
1352 */
1353 (void)VFS_START(mp, 0, ctx);
1354
1355 if (mount_list_add(mp) != 0) {
1356 /*
1357 * The system is shutting down trying to umount
1358 * everything, so fail with a plausible errno.
1359 */
1360 error = EBUSY;
1361 goto out4;
1362 }
1363 lck_rw_done(&mp->mnt_rwlock);
1364 is_rwlock_locked = FALSE;
1365
1366 /* Check if this mounted file system supports EAs or named streams. */
1367 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1368 VFSATTR_INIT(&vfsattr);
1369 VFSATTR_WANTED(&vfsattr, f_capabilities);
1370 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1371 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1372 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1373 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1374 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1375 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1376 }
1377 #if NAMEDSTREAMS
1378 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1379 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1380 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1381 }
1382 #endif
1383 /* Check if this file system supports path from id lookups. */
1384 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1385 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1386 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1387 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1388 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1389 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1390 }
1391
1392 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1393 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1394 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1395 }
1396 }
1397 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1398 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1399 }
1400 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1401 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1402 }
1403 /* increment the operations count */
1404 OSAddAtomic(1, &vfs_nummntops);
1405 enablequotas(mp, ctx);
1406
1407 if (device_vnode) {
1408 device_vnode->v_specflags |= SI_MOUNTEDON;
1409
1410 /*
1411 * cache the IO attributes for the underlying physical media...
1412 * an error return indicates the underlying driver doesn't
1413 * support all the queries necessary... however, reasonable
1414 * defaults will have been set, so no reason to bail or care
1415 */
1416 vfs_init_io_attributes(device_vnode, mp);
1417 }
1418
1419 /* Now that mount is setup, notify the listeners */
1420 vfs_notify_mount(pvp);
1421 IOBSDMountChange(mp, kIOMountChangeMount);
1422 } else {
1423 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1424 if (mp->mnt_vnodelist.tqh_first != NULL) {
1425 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1426 mp->mnt_vtable->vfc_name, error);
1427 }
1428
1429 vnode_lock_spin(vp);
1430 CLR(vp->v_flag, VMOUNT);
1431 vnode_unlock(vp);
1432 mount_list_lock();
1433 mp->mnt_vtable->vfc_refcount--;
1434 mount_list_unlock();
1435
1436 if (device_vnode) {
1437 vnode_rele(device_vnode);
1438 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1439 }
1440 lck_rw_done(&mp->mnt_rwlock);
1441 is_rwlock_locked = FALSE;
1442
1443 /*
1444 * if we get here, we have a mount structure that needs to be freed,
1445 * but since the coveredvp hasn't yet been updated to point at it,
1446 * no need to worry about other threads holding a crossref on this mp
1447 * so it's ok to just free it
1448 */
1449 mount_lock_destroy(mp);
1450 #if CONFIG_MACF
1451 mac_mount_label_destroy(mp);
1452 #endif
1453 zfree(mount_zone, mp);
1454 did_set_lmount = false;
1455 }
1456 exit:
1457 /*
1458 * drop I/O count on the device vp if there was one
1459 */
1460 if (devpath && devvp) {
1461 vnode_put(devvp);
1462 }
1463
1464 if (did_set_lmount) {
1465 mount_lock_spin(mp);
1466 mp->mnt_lflag &= ~MNT_LMOUNT;
1467 mount_unlock(mp);
1468 }
1469
1470 return error;
1471
1472 /* Error condition exits */
1473 out4:
1474 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1475
1476 /*
1477 * If the mount has been placed on the covered vp,
1478 * it may have been discovered by now, so we have
1479 * to treat this just like an unmount
1480 */
1481 mount_lock_spin(mp);
1482 mp->mnt_lflag |= MNT_LDEAD;
1483 mount_unlock(mp);
1484
1485 if (device_vnode != NULLVP) {
1486 vnode_rele(device_vnode);
1487 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1488 ctx);
1489 did_rele = TRUE;
1490 }
1491
1492 vnode_lock_spin(vp);
1493
1494 mp->mnt_crossref++;
1495 vp->v_mountedhere = (mount_t) 0;
1496
1497 vnode_unlock(vp);
1498
1499 if (have_usecount) {
1500 vnode_rele(vp);
1501 }
1502 out3:
1503 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1504 vnode_rele(devvp);
1505 }
1506 out2:
1507 if (devpath && devvp) {
1508 vnode_put(devvp);
1509 }
1510 out1:
1511 /* Release mnt_rwlock only when it was taken */
1512 if (is_rwlock_locked == TRUE) {
1513 if (flag_set) {
1514 mp->mnt_flag = flag; /* restore mnt_flag value */
1515 }
1516 lck_rw_done(&mp->mnt_rwlock);
1517 }
1518
1519 if (did_set_lmount) {
1520 mount_lock_spin(mp);
1521 mp->mnt_lflag &= ~MNT_LMOUNT;
1522 mount_unlock(mp);
1523 }
1524
1525 if (mntalloc) {
1526 if (mp->mnt_crossref) {
1527 mount_dropcrossref(mp, vp, 0);
1528 } else {
1529 mount_lock_destroy(mp);
1530 #if CONFIG_MACF
1531 mac_mount_label_destroy(mp);
1532 #endif
1533 zfree(mount_zone, mp);
1534 }
1535 }
1536 if (vfsp_ref) {
1537 mount_list_lock();
1538 vfsp->vfc_refcount--;
1539 mount_list_unlock();
1540 }
1541
1542 return error;
1543 }
1544
1545 /*
1546 * Flush in-core data, check for competing mount attempts,
1547 * and set VMOUNT
1548 */
1549 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1550 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1551 {
1552 #if !CONFIG_MACF
1553 #pragma unused(cnp,fsname)
1554 #endif
1555 struct vnode_attr va;
1556 int error;
1557 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1558 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1559 boolean_t is_busy;
1560
1561 if (!skip_auth) {
1562 /*
1563 * If the user is not root, ensure that they own the directory
1564 * onto which we are attempting to mount.
1565 */
1566 VATTR_INIT(&va);
1567 VATTR_WANTED(&va, va_uid);
1568 if ((error = vnode_getattr(vp, &va, ctx)) ||
1569 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1570 (!vfs_context_issuser(ctx)))) {
1571 error = EPERM;
1572 goto out;
1573 }
1574 }
1575
1576 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1577 goto out;
1578 }
1579
1580 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1581 goto out;
1582 }
1583
1584 if (vp->v_type != VDIR) {
1585 error = ENOTDIR;
1586 goto out;
1587 }
1588
1589 vnode_lock_spin(vp);
1590 is_busy = is_fmount ?
1591 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1592 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1593 if (is_busy) {
1594 vnode_unlock(vp);
1595 error = EBUSY;
1596 goto out;
1597 }
1598 SET(vp->v_flag, VMOUNT);
1599 vnode_unlock(vp);
1600
1601 #if CONFIG_MACF
1602 error = mac_mount_check_mount(ctx, vp,
1603 cnp, fsname);
1604 if (error != 0) {
1605 vnode_lock_spin(vp);
1606 CLR(vp->v_flag, VMOUNT);
1607 vnode_unlock(vp);
1608 }
1609 #endif
1610
1611 out:
1612 return error;
1613 }
1614
1615 #if CONFIG_IMGSRC_ACCESS
1616
1617 #define DEBUG_IMGSRC 0
1618
1619 #if DEBUG_IMGSRC
1620 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1621 #else
1622 #define IMGSRC_DEBUG(args...) do { } while(0)
1623 #endif
1624
1625 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1626 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1627 {
1628 struct nameidata nd;
1629 vnode_t vp, realdevvp;
1630 mode_t accessmode;
1631 int error;
1632 enum uio_seg uio = UIO_USERSPACE;
1633
1634 if (ctx == vfs_context_kernel()) {
1635 uio = UIO_SYSSPACE;
1636 }
1637
1638 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1639 if ((error = namei(&nd))) {
1640 IMGSRC_DEBUG("namei() failed with %d\n", error);
1641 return error;
1642 }
1643
1644 vp = nd.ni_vp;
1645
1646 if (!vnode_isblk(vp)) {
1647 IMGSRC_DEBUG("Not block device.\n");
1648 error = ENOTBLK;
1649 goto out;
1650 }
1651
1652 realdevvp = mp->mnt_devvp;
1653 if (realdevvp == NULLVP) {
1654 IMGSRC_DEBUG("No device backs the mount.\n");
1655 error = ENXIO;
1656 goto out;
1657 }
1658
1659 error = vnode_getwithref(realdevvp);
1660 if (error != 0) {
1661 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1662 goto out;
1663 }
1664
1665 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1666 IMGSRC_DEBUG("Wrong dev_t.\n");
1667 error = ENXIO;
1668 goto out1;
1669 }
1670
1671 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1672
1673 /*
1674 * If mount by non-root, then verify that user has necessary
1675 * permissions on the device.
1676 */
1677 if (!vfs_context_issuser(ctx)) {
1678 accessmode = KAUTH_VNODE_READ_DATA;
1679 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1680 accessmode |= KAUTH_VNODE_WRITE_DATA;
1681 }
1682 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1683 IMGSRC_DEBUG("Access denied.\n");
1684 goto out1;
1685 }
1686 }
1687
1688 *devvpp = vp;
1689
1690 out1:
1691 vnode_put(realdevvp);
1692
1693 out:
1694 nameidone(&nd);
1695
1696 if (error) {
1697 vnode_put(vp);
1698 }
1699
1700 return error;
1701 }
1702
1703 /*
1704 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1705 * and call checkdirs()
1706 */
1707 static int
place_mount_and_checkdirs(mount_t mp,vnode_t vp,vfs_context_t ctx)1708 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1709 {
1710 int error;
1711
1712 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1713
1714 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
1715 mp->mnt_vtable->vfc_name, vnode_getname(vp));
1716
1717 vnode_lock_spin(vp);
1718 CLR(vp->v_flag, VMOUNT);
1719 vp->v_mountedhere = mp;
1720 vnode_unlock(vp);
1721
1722 /*
1723 * taking the name_cache_lock exclusively will
1724 * insure that everyone is out of the fast path who
1725 * might be trying to use a now stale copy of
1726 * vp->v_mountedhere->mnt_realrootvp
1727 * bumping mount_generation causes the cached values
1728 * to be invalidated
1729 */
1730 name_cache_lock();
1731 mount_generation++;
1732 name_cache_unlock();
1733
1734 error = vnode_ref(vp);
1735 if (error != 0) {
1736 goto out;
1737 }
1738
1739 error = checkdirs(vp, ctx);
1740 if (error != 0) {
1741 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1742 vnode_rele(vp);
1743 goto out;
1744 }
1745
1746 out:
1747 if (error != 0) {
1748 mp->mnt_vnodecovered = NULLVP;
1749 }
1750 return error;
1751 }
1752
/*
 * Undo place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode and detach the mount from it.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1763
/*
 * Prepare 'mp' for an update-style operation: fail if a mount or
 * unmount is already in flight, take the mount rwlock exclusively,
 * then authorize the caller (mount owner or superuser, plus the MAC
 * remount check).
 *
 * On success (return 0) the rwlock is held and the caller must drop
 * it with mount_end_update().  On failure the rwlock is released here.
 */
static int
mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
{
	int error;

	/* unmount in progress return error */
	mount_lock_spin(mp);
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		mount_unlock(mp);
		return EBUSY;
	}
	mount_unlock(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/*
	 * We only allow the filesystem to be reloaded if it
	 * is currently mounted read-only.
	 */
	if ((flags & MNT_RELOAD) &&
	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to update it.
	 */
	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
	    (!vfs_context_issuser(ctx))) {
		error = EPERM;
		goto out;
	}
#if CONFIG_MACF
	error = mac_mount_check_remount(ctx, mp);
	if (error != 0) {
		goto out;
	}
#endif

out:
	if (error) {
		/* caller does not get the rwlock on failure */
		lck_rw_done(&mp->mnt_rwlock);
	}

	return error;
}
1811
/*
 * Release the mount rwlock taken by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1817
1818 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)1819 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1820 {
1821 vnode_t vp;
1822
1823 if (height >= MAX_IMAGEBOOT_NESTING) {
1824 return EINVAL;
1825 }
1826
1827 vp = imgsrc_rootvnodes[height];
1828 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1829 *rvpp = vp;
1830 return 0;
1831 } else {
1832 return ENOENT;
1833 }
1834 }
1835
/*
 * Relocate an imageboot source filesystem (one of imgsrc_rootvnodes[])
 * so that it becomes mounted on 'vp' instead of its original location.
 *
 * 'fsmountargs' is parsed either as a mnt_imgsrc_args structure
 * (by_index == TRUE, which selects the nesting 'height'), or — for
 * binary compatibility — as a bare device-path pointer implying
 * height 0 and no flags.
 *
 * Root-only.  A given mount may only be moved once; this is enforced
 * with MNTK_HAS_MOVED, checked once optimistically and again under the
 * mount rwlock.  On failure, partial state is unwound via the out*
 * labels in reverse order of acquisition.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* argument struct layout differs between 32- and 64-bit callers */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* no flag bits are defined yet; reject anything nonzero */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* takes an iocount on rvp on success */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* scratch buffer to restore f_mntonname if mount_list_add() fails */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* only needed the authorization; drop the iocount right away */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* remember the old mount-on name in case mount_list_add() fails below */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* restore the previous mount-on name and allow a later move attempt */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2056
2057 #endif /* CONFIG_IMGSRC_ACCESS */
2058
/*
 * Enable disk quotas on a freshly mounted (or updated) HFS volume when
 * the quota trigger files exist in its root directory.  Errors are
 * deliberately ignored: quota setup must not make the mount itself fail.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* the presence of the ".opsname" trigger file requests quotas */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* point quotactl at the actual quota data file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2092
2093
/*
 * Per-process callback for checkdirs(): if this process's current or
 * root directory is 'olddp' (the vnode just covered by a new mount),
 * retarget it to 'newdp' (the root of the new mount).
 *
 * Always returns PROC_RETURNED so the process iteration continues;
 * if a reference on 'newdp' cannot be taken, the process is simply
 * left untouched.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* hold the spare refs to release; old_* the displaced vnodes */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2173
2174
2175
2176 /*
2177 * Scan all active processes to see if any of them have a current
2178 * or root directory onto which the new filesystem has just been
2179 * mounted. If so, replace them with the new mount point.
2180 */
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * Also swaps the global rootvnode if it was the covered vnode.
 * Returns 0 on success (or when there is nothing to do), or the
 * error from VFS_ROOT().
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only our caller holds a usecount: nobody is using olddp as a dir. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get (with an iocount) the root of the filesystem covering olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	if (rootvnode == olddp) {
		/* Take the new ref before publishing, release the old after. */
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2218
2219 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2220 "com.apple.private.vfs.role-account-unmount"
2221
2222 /*
2223 * Unmount a file system.
2224 *
2225 * Note: unmount takes a path to the vnode mounted on as argument,
2226 * not special file (as before).
2227 */
2228 /* ARGSUSED */
/*
 * unmount(2) system call.
 *
 * Resolves uap->path to the root vnode of a mounted filesystem, takes
 * a mount ref, and hands the unmount off to safedounmount() (which
 * consumes that ref).  Returns 0 or an errno.
 */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	/* Ask the MAC framework whether this unmount is permitted. */
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Hold the mount across the vnode_put; safedounmount needs it alive. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2275
2276 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2277 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2278 {
2279 mount_t mp;
2280
2281 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2282 if (mp == (mount_t)0) {
2283 return ENOENT;
2284 }
2285 mount_ref(mp, 0);
2286 mount_iterdrop(mp);
2287 /* safedounmount consumes the mount ref */
2288 return safedounmount(mp, flags, ctx);
2289 }
2290
2291 /*
2292 * The mount struct comes with a mount ref which will be consumed.
2293 * Do the actual file system unmount, prevent some common foot shooting.
2294 */
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Policy gate in front of dounmount(): rejects unmounts of unresponsive
 * filesystems (when MNT_NOBLOCK and not forced), unauthorized callers,
 * the root filesystem and its associated system volumes, and the mount
 * backing the root disk image.  On any rejection the mount ref is
 * dropped here; on success dounmount() consumes it.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* withref=1: dounmount consumes the mount ref on our behalf. */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2358
2359 /*
2360 * Do the actual file system unmount.
2361 */
/*
 * Do the actual file system unmount.
 *
 * mp       - the mount to tear down; if withref, carries a mount ref
 *            which is consumed here.
 * flags    - MNT_FORCE, MNT_NOBLOCK, MNT_LNOSUB, etc.
 * ctx      - caller's vfs context.
 *
 * Sequence: mark the mount as unmounting (under the mount lock), force
 * out submounts if forced, invalidate name-cache fast paths, sync and
 * vflush vnodes, call VFS_UNMOUNT, close the device vnode, detach from
 * the covered vnode and the mount list, then drain remaining references
 * and deliver trigger callbacks / knotes.  Any failure after the
 * unmount-in-progress flags are set restores those flags and exits via
 * the common "out:" path.  Returns 0 or an errno.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Don't let unresponsive remote filesystems hang this process. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* sync failed: cancel the unmount-in-progress state */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* busy vnodes remain: cancel the unmount-in-progress state */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* filesystem refused: re-enable iteration and cancel state */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: mount lock is held here on both success and failure. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* notify watchers of the parent directory */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* root mount has no covered vnode; free the mount here */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2646
2647 /*
2648 * Unmount any mounts in this filesystem.
2649 */
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects the fsids of every mount transitively covered by mp (the
 * mount list is in mount order, so a single forward pass suffices)
 * and then unmounts them in reverse (deepest-first) order.  Errors,
 * including allocation failure, are silently ignored: this is a
 * best-effort sweep used by forced unmounts.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	// Z_NOWAIT: we cannot block for memory while holding mount_list_lock.
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				// smp is (transitively) mounted on mp: record it
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
2707
2708 void
mount_dropcrossref(mount_t mp,vnode_t dp,int need_put)2709 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2710 {
2711 vnode_lock(dp);
2712 mp->mnt_crossref--;
2713
2714 if (mp->mnt_crossref < 0) {
2715 panic("mount cross refs -ve");
2716 }
2717
2718 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2719 if (need_put) {
2720 vnode_put_locked(dp);
2721 }
2722 vnode_unlock(dp);
2723
2724 mount_lock_destroy(mp);
2725 #if CONFIG_MACF
2726 mac_mount_label_destroy(mp);
2727 #endif
2728 zfree(mount_zone, mp);
2729 return;
2730 }
2731 if (need_put) {
2732 vnode_put_locked(dp);
2733 }
2734 vnode_unlock(dp);
2735 }
2736
2737
2738 /*
2739 * Sync each mounted filesystem.
2740 */
2741 #if DIAGNOSTIC
2742 int syncprt = 0;
2743 #endif
2744
2745 int print_vmpage_stat = 0;
2746
2747 /*
2748 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2749 * mounted read-write with the passed waitfor value.
2750 *
2751 * Parameters: mp mount-point descriptor per mounted file-system instance.
2752 * arg user argument (please see below)
2753 *
2754 * User argument is a pointer to 32 bit unsigned integer which describes the
2755 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2756 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2757 * waitfor value.
2758 *
2759 * Returns: VFS_RETURNED
2760 */
2761 static int
sync_callback(mount_t mp,void * arg)2762 sync_callback(mount_t mp, void *arg)
2763 {
2764 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2765 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2766 unsigned waitfor = MNT_NOWAIT;
2767
2768 if (arg) {
2769 waitfor = *(uint32_t*)arg;
2770 }
2771
2772 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2773 if (waitfor != MNT_WAIT &&
2774 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2775 waitfor != MNT_NOWAIT &&
2776 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2777 waitfor != MNT_DWAIT &&
2778 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2779 panic("Passed inappropriate waitfor %u to "
2780 "sync_callback()", waitfor);
2781 }
2782
2783 mp->mnt_flag &= ~MNT_ASYNC;
2784 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2785 if (asyncflag) {
2786 mp->mnt_flag |= MNT_ASYNC;
2787 }
2788 }
2789
2790 return VFS_RETURNED;
2791 }
2792
2793 /* ARGSUSED */
2794 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)2795 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2796 {
2797 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2798
2799 if (print_vmpage_stat) {
2800 vm_countdirtypages();
2801 }
2802
2803 #if DIAGNOSTIC
2804 if (syncprt) {
2805 vfs_bufstats();
2806 }
2807 #endif /* DIAGNOSTIC */
2808 return 0;
2809 }
2810
/* Which class of backing media an internal sync pass should cover. */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering: every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* virtual or non-local devices only */
} sync_type_t;
2816
2817 static int
sync_internal_callback(mount_t mp,void * arg)2818 sync_internal_callback(mount_t mp, void *arg)
2819 {
2820 if (arg) {
2821 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2822 (mp->mnt_flag & MNT_LOCAL);
2823 sync_type_t sync_type = *((sync_type_t *)arg);
2824
2825 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2826 return VFS_RETURNED;
2827 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2828 return VFS_RETURNED;
2829 }
2830 }
2831
2832 (void)sync_callback(mp, NULL);
2833
2834 return VFS_RETURNED;
2835 }
2836
2837 int sync_thread_state = 0;
2838 int sync_timeout_seconds = 5;
2839
2840 #define SYNC_THREAD_RUN 0x0001
2841 #define SYNC_THREAD_RUNNING 0x0002
2842
2843 #if CONFIG_PHYS_WRITE_ACCT
2844 thread_t pm_sync_thread;
2845 #endif /* CONFIG_PHYS_WRITE_ACCT */
2846
/*
 * Body of the kernel thread spawned by sync_internal().
 *
 * Loops while SYNC_THREAD_RUN is set, each pass syncing reliable media
 * first and then unreliable media, then wakes any waiter in
 * sync_internal() and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; a new one may arrive while we sync. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Sync fast, reliable media first so it isn't held up by slow media. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2890
2891 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2892
2893 /*
2894 * An in-kernel sync for power management to call.
2895 * This function always returns within sync_timeout seconds.
2896 */
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Posts a run request to the (possibly already running) sync_thread and
 * waits up to sync_timeout_seconds for it to complete.  A timeout is
 * logged (rate-limited to once per 120s) but not reported: the return
 * value is always 0.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* Mark running before starting so a racing caller won't double-spawn. */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP: msleep releases sync_mtx_lck on return. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Timed out (or interrupted); log at most once every 2 minutes. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2939
2940 /*
2941 * Change filesystem quotas.
2942 */
2943 #if QUOTA
/*
 * quotactl(2) system call (QUOTA-enabled build).
 *
 * Resolves uap->path to its mount, performs MAC and quota-specific
 * copyin for the subcommand, dispatches to VFS_QUOTACTL(), and copies
 * results back out for the commands that produce data.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold the mount, not the vnode, for the duration of the call. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				/* convert the 64-bit user layout to the kernel dqblk */
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Post-processing: free temporaries and copy results back out. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3050 #else
/* quotactl(2) stub for kernels built without QUOTA support. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3056 #endif /* QUOTA */
3057
3058 /*
3059 * Get filesystem statistics.
3060 *
3061 * Returns: 0 Success
3062 * namei:???
3063 * vfs_update_vfsstat:???
3064 * munge_statfs:EFAULT
3065 */
3066 /* ARGSUSED */
3067 int
statfs(__unused proc_t p,struct statfs_args * uap,__unused int32_t * retval)3068 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3069 {
3070 struct mount *mp;
3071 struct vfsstatfs *sp;
3072 int error;
3073 struct nameidata nd;
3074 vfs_context_t ctx = vfs_context_current();
3075 vnode_t vp;
3076
3077 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3078 UIO_USERSPACE, uap->path, ctx);
3079 error = namei(&nd);
3080 if (error != 0) {
3081 return error;
3082 }
3083 vp = nd.ni_vp;
3084 mp = vp->v_mount;
3085 sp = &mp->mnt_vfsstat;
3086 nameidone(&nd);
3087
3088 #if CONFIG_MACF
3089 error = mac_mount_check_stat(ctx, mp);
3090 if (error != 0) {
3091 vnode_put(vp);
3092 return error;
3093 }
3094 #endif
3095
3096 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3097 if (error != 0) {
3098 vnode_put(vp);
3099 return error;
3100 }
3101
3102 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3103 vnode_put(vp);
3104 return error;
3105 }
3106
3107 /*
3108 * Get filesystem statistics.
3109 */
3110 /* ARGSUSED */
/*
 * Get filesystem statistics for the filesystem containing the open
 * file uap->fd; legacy (non-64-bit) statfs layout.
 */
/* ARGSUSED */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* vnode is no longer associated with a mount (e.g. revoked) */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	sp = &mp->mnt_vfsstat;
	/* Refresh the cached statistics before reporting them. */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3159
3160 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3161 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3162 {
3163 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3164
3165 bzero(sfs, sizeof(*sfs));
3166
3167 sfs->f_bsize = vsfs->f_bsize;
3168 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3169 sfs->f_blocks = vsfs->f_blocks;
3170 sfs->f_bfree = vsfs->f_bfree;
3171 sfs->f_bavail = vsfs->f_bavail;
3172 sfs->f_files = vsfs->f_files;
3173 sfs->f_ffree = vsfs->f_ffree;
3174 sfs->f_fsid = vsfs->f_fsid;
3175 sfs->f_owner = vsfs->f_owner;
3176 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3177 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3178 sfs->f_fssubtype = vsfs->f_fssubtype;
3179 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3180 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3181 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3182 } else {
3183 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3184 }
3185 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3186 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3187 }
3188
3189 /*
3190 * Get file system statistics in 64-bit mode
3191 */
/*
 * Get file system statistics in 64-bit mode
 *
 * The nameidata and statfs64 are heap-allocated together to keep this
 * deep-in-the-stack syscall's frame small.
 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh the cached statistics before reporting them. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3249
/*
 * Get file system statistics in 64-bit mode, for the file system
 * containing the file referenced by the given file descriptor.
 */
3253 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3254 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3255 {
3256 struct vnode *vp;
3257 struct mount *mp;
3258 struct statfs64 sfs;
3259 int error;
3260
3261 AUDIT_ARG(fd, uap->fd);
3262
3263 if ((error = file_vnode(uap->fd, &vp))) {
3264 return error;
3265 }
3266
3267 error = vnode_getwithref(vp);
3268 if (error) {
3269 file_drop(uap->fd);
3270 return error;
3271 }
3272
3273 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3274
3275 mp = vp->v_mount;
3276 if (!mp) {
3277 error = EBADF;
3278 goto out;
3279 }
3280
3281 #if CONFIG_MACF
3282 error = mac_mount_check_stat(vfs_context_current(), mp);
3283 if (error != 0) {
3284 goto out;
3285 }
3286 #endif
3287
3288 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3289 goto out;
3290 }
3291
3292 vfs_get_statfs64(mp, &sfs);
3293 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3294 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3295 /* This process does not want to see a seperate data volume mountpoint */
3296 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3297 }
3298 error = copyout(&sfs, uap->buf, sizeof(sfs));
3299
3300 out:
3301 file_drop(uap->fd);
3302 vnode_put(vp);
3303
3304 return error;
3305 }
3306
/*
 * Shared iteration state for the getfsstat family of callbacks
 * (getfsstat_callback / getfsstat64_callback), passed via vfs_iterate().
 */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user-buffer cursor; next statfs record is copied here */
	user_addr_t *mp;        /* optional per-mount MAC label destinations (may be NULL) */
	int count;              /* mounts visited so far (may exceed maxcount when just counting) */
	int maxcount;           /* capacity of the user buffer, in records */
	int flags;              /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT flags */
	int error;              /* first error recorded by the callback, 0 if none */
};
3315
3316
3317 static int
getfsstat_callback(mount_t mp,void * arg)3318 getfsstat_callback(mount_t mp, void * arg)
3319 {
3320 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3321 struct vfsstatfs *sp;
3322 int error, my_size;
3323 vfs_context_t ctx = vfs_context_current();
3324
3325 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3326 #if CONFIG_MACF
3327 error = mac_mount_check_stat(ctx, mp);
3328 if (error != 0) {
3329 fstp->error = error;
3330 return VFS_RETURNED_DONE;
3331 }
3332 #endif
3333 sp = &mp->mnt_vfsstat;
3334 /*
3335 * If MNT_NOWAIT is specified, do not refresh the
3336 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3337 */
3338 if ((mp->mnt_lflag & MNT_LDEAD) ||
3339 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3340 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3341 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3342 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3343 return VFS_RETURNED;
3344 }
3345
3346 /*
3347 * Need to handle LP64 version of struct statfs
3348 */
3349 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3350 if (error) {
3351 fstp->error = error;
3352 return VFS_RETURNED_DONE;
3353 }
3354 fstp->sfsp += my_size;
3355
3356 if (fstp->mp) {
3357 #if CONFIG_MACF
3358 error = mac_mount_label_get(mp, *fstp->mp);
3359 if (error) {
3360 fstp->error = error;
3361 return VFS_RETURNED_DONE;
3362 }
3363 #endif
3364 fstp->mp++;
3365 }
3366 }
3367 fstp->count++;
3368 return VFS_RETURNED;
3369 }
3370
3371 /*
3372 * Get statistics on all filesystems.
3373 */
3374 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3375 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3376 {
3377 struct __mac_getfsstat_args muap;
3378
3379 muap.buf = uap->buf;
3380 muap.bufsize = uap->bufsize;
3381 muap.mac = USER_ADDR_NULL;
3382 muap.macsize = 0;
3383 muap.flags = uap->flags;
3384
3385 return __mac_getfsstat(p, &muap, retval);
3386 }
3387
3388 /*
3389 * __mac_getfsstat: Get MAC-related file system statistics
3390 *
3391 * Parameters: p (ignored)
3392 * uap User argument descriptor (see below)
3393 * retval Count of file system statistics (N stats)
3394 *
3395 * Indirect: uap->bufsize Buffer size
3396 * uap->macsize MAC info size
3397 * uap->buf Buffer where information will be returned
3398 * uap->mac MAC info
3399 * uap->flags File system flags
3400 *
3401 *
3402 * Returns: 0 Success
3403 * !0 Not success
3404 *
3405 */
3406 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3407 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3408 {
3409 user_addr_t sfsp;
3410 user_addr_t *mp;
3411 size_t count, maxcount, bufsize, macsize;
3412 struct getfsstat_struct fst;
3413
3414 if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3415 return EINVAL;
3416 }
3417
3418 bufsize = (size_t) uap->bufsize;
3419 macsize = (size_t) uap->macsize;
3420
3421 if (IS_64BIT_PROCESS(p)) {
3422 maxcount = bufsize / sizeof(struct user64_statfs);
3423 } else {
3424 maxcount = bufsize / sizeof(struct user32_statfs);
3425 }
3426 sfsp = uap->buf;
3427 count = 0;
3428
3429 mp = NULL;
3430
3431 #if CONFIG_MACF
3432 if (uap->mac != USER_ADDR_NULL) {
3433 u_int32_t *mp0;
3434 int error;
3435 unsigned int i;
3436
3437 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3438 if (count != maxcount) {
3439 return EINVAL;
3440 }
3441
3442 /* Copy in the array */
3443 mp0 = kalloc_data(macsize, Z_WAITOK);
3444 if (mp0 == NULL) {
3445 return ENOMEM;
3446 }
3447
3448 error = copyin(uap->mac, mp0, macsize);
3449 if (error) {
3450 kfree_data(mp0, macsize);
3451 return error;
3452 }
3453
3454 /* Normalize to an array of user_addr_t */
3455 mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3456 if (mp == NULL) {
3457 kfree_data(mp0, macsize);
3458 return ENOMEM;
3459 }
3460
3461 for (i = 0; i < count; i++) {
3462 if (IS_64BIT_PROCESS(p)) {
3463 mp[i] = ((user_addr_t *)mp0)[i];
3464 } else {
3465 mp[i] = (user_addr_t)mp0[i];
3466 }
3467 }
3468 kfree_data(mp0, macsize);
3469 }
3470 #endif
3471
3472
3473 fst.sfsp = sfsp;
3474 fst.mp = mp;
3475 fst.flags = uap->flags;
3476 fst.count = 0;
3477 fst.error = 0;
3478 fst.maxcount = (int)maxcount;
3479
3480
3481 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3482
3483 if (mp) {
3484 kfree_data(mp, count * sizeof(user_addr_t));
3485 }
3486
3487 if (fst.error) {
3488 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3489 return fst.error;
3490 }
3491
3492 if (fst.sfsp && fst.count > fst.maxcount) {
3493 *retval = fst.maxcount;
3494 } else {
3495 *retval = fst.count;
3496 }
3497 return 0;
3498 }
3499
3500 static int
getfsstat64_callback(mount_t mp,void * arg)3501 getfsstat64_callback(mount_t mp, void * arg)
3502 {
3503 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3504 struct vfsstatfs *sp;
3505 struct statfs64 sfs;
3506 int error;
3507
3508 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3509 #if CONFIG_MACF
3510 error = mac_mount_check_stat(vfs_context_current(), mp);
3511 if (error != 0) {
3512 fstp->error = error;
3513 return VFS_RETURNED_DONE;
3514 }
3515 #endif
3516 sp = &mp->mnt_vfsstat;
3517 /*
3518 * If MNT_NOWAIT is specified, do not refresh the fsstat
3519 * cache. MNT_WAIT overrides MNT_NOWAIT.
3520 *
3521 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3522 * getfsstat, since the constants are out of the same
3523 * namespace.
3524 */
3525 if ((mp->mnt_lflag & MNT_LDEAD) ||
3526 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3527 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3528 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3529 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3530 return VFS_RETURNED;
3531 }
3532
3533 vfs_get_statfs64(mp, &sfs);
3534 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3535 if (error) {
3536 fstp->error = error;
3537 return VFS_RETURNED_DONE;
3538 }
3539 fstp->sfsp += sizeof(sfs);
3540 }
3541 fstp->count++;
3542 return VFS_RETURNED;
3543 }
3544
3545 /*
3546 * Get statistics on all file systems in 64 bit mode.
3547 */
3548 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3549 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3550 {
3551 user_addr_t sfsp;
3552 int count, maxcount;
3553 struct getfsstat_struct fst;
3554
3555 maxcount = uap->bufsize / sizeof(struct statfs64);
3556
3557 sfsp = uap->buf;
3558 count = 0;
3559
3560 fst.sfsp = sfsp;
3561 fst.flags = uap->flags;
3562 fst.count = 0;
3563 fst.error = 0;
3564 fst.maxcount = maxcount;
3565
3566 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3567
3568 if (fst.error) {
3569 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3570 return fst.error;
3571 }
3572
3573 if (fst.sfsp && fst.count > fst.maxcount) {
3574 *retval = fst.maxcount;
3575 } else {
3576 *retval = fst.count;
3577 }
3578
3579 return 0;
3580 }
3581
3582 /*
3583 * gets the associated vnode with the file descriptor passed.
3584 * as input
3585 *
3586 * INPUT
3587 * ctx - vfs context of caller
3588 * fd - file descriptor for which vnode is required.
3589 * vpp - Pointer to pointer to vnode to be returned.
3590 *
3591 * The vnode is returned with an iocount so any vnode obtained
3592 * by this call needs a vnode_put
3593 *
3594 */
3595 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3596 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3597 {
3598 int error;
3599 vnode_t vp;
3600 struct fileproc *fp;
3601 proc_t p = vfs_context_proc(ctx);
3602
3603 *vpp = NULLVP;
3604
3605 error = fp_getfvp(p, fd, &fp, &vp);
3606 if (error) {
3607 return error;
3608 }
3609
3610 error = vnode_getwithref(vp);
3611 if (error) {
3612 (void)fp_drop(p, fd, fp, 0);
3613 return error;
3614 }
3615
3616 (void)fp_drop(p, fd, fp, 0);
3617 *vpp = vp;
3618 return error;
3619 }
3620
3621 /*
3622 * Wrapper function around namei to start lookup from a directory
3623 * specified by a file descriptor ni_dirfd.
3624 *
3625 * In addition to all the errors returned by namei, this call can
3626 * return ENOTDIR if the file descriptor does not refer to a directory.
3627 * and EBADF if the file descriptor is not valid.
3628 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * The descriptor only matters for a fresh lookup of a relative path:
	 * skip the dirfd handling for AT_FDCWD, for continued lookups, and
	 * when the caller already supplied a starting directory (USEDVP).
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: resolve it against the vnode backing dirfd. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Borrow the USEDVP flag to hand namei the starting
			 * directory, and restore it afterwards: the caller
			 * owns ndp and did not set the flag itself.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
3672
3673 /*
3674 * Change current working directory to a given file descriptor.
3675 */
3676 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	/* should_put: vp still carries the namei/getwithref iocount at 'out' */
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must be able to search the new working directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If something is mounted on the directory, descend to the root of
	 * the covering filesystem (repeatedly, for stacked mounts) so the
	 * cwd ends up on the visible, topmost filesystem.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the transient iocount for a long-lived usecount on the cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the process cwd under both the dirs and fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
3789
3790 int
fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)3791 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3792 {
3793 return common_fchdir(p, uap, 0);
3794 }
3795
3796 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)3797 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3798 {
3799 return common_fchdir(p, (void *)uap, 1);
3800 }
3801
3802
3803 /*
3804 * Change current working directory (".").
3805 *
3806 * Returns: 0 Success
3807 * change_dir:ENOTDIR
3808 * change_dir:???
3809 * vnode_ref:ENOENT No such file or directory
3810 */
3811 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Lookup + VDIR/MACF/search checks; returns ni_vp with an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Trade the transient iocount for a long-lived usecount on the cwd. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap the process cwd under both the dirs and fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3857
3858
3859 /*
3860 * Change current working directory (".").
3861 *
3862 * Returns: 0 Success
3863 * chdir_internal:ENOTDIR
3864 * chdir_internal:ENOENT No such file or directory
3865 * chdir_internal:???
3866 */
3867 /* ARGSUSED */
3868 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)3869 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3870 {
3871 struct nameidata nd;
3872 vfs_context_t ctx = vfs_context_current();
3873
3874 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3875 UIO_USERSPACE, uap->path, ctx);
3876
3877 return chdir_internal(p, ctx, &nd, per_thread);
3878 }
3879
3880
3881 /*
3882 * chdir
3883 *
3884 * Change current working directory (".") for the entire process
3885 *
3886 * Parameters: p Process requesting the call
3887 * uap User argument descriptor (see below)
3888 * retval (ignored)
3889 *
3890 * Indirect parameters: uap->path Directory path
3891 *
3892 * Returns: 0 Success
3893 * common_chdir: ENOTDIR
3894 * common_chdir: ENOENT No such file or directory
3895 * common_chdir: ???
3896 *
3897 */
3898 int
chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)3899 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3900 {
3901 return common_chdir(p, (void *)uap, 0);
3902 }
3903
3904 /*
3905 * __pthread_chdir
3906 *
3907 * Change current working directory (".") for a single thread
3908 *
3909 * Parameters: p Process requesting the call
3910 * uap User argument descriptor (see below)
3911 * retval (ignored)
3912 *
3913 * Indirect parameters: uap->path Directory path
3914 *
3915 * Returns: 0 Success
3916 * common_chdir: ENOTDIR
3917 * common_chdir: ENOENT No such file or directory
3918 * common_chdir: ???
3919 *
3920 */
3921 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)3922 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3923 {
3924 return common_chdir(p, (void *)uap, 1);
3925 }
3926
3927
3928 /*
3929 * Change notion of root (``/'') directory.
3930 */
3931 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot is restricted to the superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Lookup + VDIR/MACF/search checks; returns ni_vp with an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the transient iocount for a long-lived usecount on the root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount held by the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3989
3990 #define PATHSTATICBUFLEN 256
3991 #define PIVOT_ROOT_ENTITLEMENT \
3992 "com.apple.private.vfs.pivot-root"
3993
3994 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small stack buffers first; fall back to heap on ENAMETOOLONG. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the path of the filesystem that will become the new root. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		/* Path exceeds the static buffer: retry with a MAXPATHLEN heap buffer. */
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Copy in the path where the outgoing root will be re-mounted. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point at whichever copy (stack or heap) actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	/* Perform the actual root switch; virtual devices are disallowed. */
	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4086 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root(2) is only supported on macOS; reject elsewhere. */
	return nosys(p, NULL, retval);
}
4092 #endif /* XNU_TARGET_OS_OSX */
4093
4094 /*
4095 * Common routine for chroot and chdir.
4096 *
4097 * Returns: 0 Success
4098 * ENOTDIR Not a directory
4099 * namei:??? [anything namei can return]
4100 * vnode_authorize:??? [anything vnode_authorize can return]
4101 */
4102 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4103 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4104 {
4105 vnode_t vp;
4106 int error;
4107
4108 if ((error = namei(ndp))) {
4109 return error;
4110 }
4111 nameidone(ndp);
4112 vp = ndp->ni_vp;
4113
4114 if (vp->v_type != VDIR) {
4115 vnode_put(vp);
4116 return ENOTDIR;
4117 }
4118
4119 #if CONFIG_MACF
4120 error = mac_vnode_check_chdir(ctx, vp);
4121 if (error) {
4122 vnode_put(vp);
4123 return error;
4124 }
4125 #endif
4126
4127 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4128 if (error) {
4129 vnode_put(vp);
4130 return error;
4131 }
4132
4133 return error;
4134 }
4135
/*
 * Allocate the per-file-descriptor vnode data (used for directories).
 */
4139 struct fd_vn_data *
fg_vn_data_alloc(void)4140 fg_vn_data_alloc(void)
4141 {
4142 struct fd_vn_data *fvdata;
4143
4144 /* Allocate per fd vnode data */
4145 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4146 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4147 return fvdata;
4148 }
4149
4150 /*
4151 * Free the vnode data (for directories) associated with the file glob.
4152 */
4153 void
fg_vn_data_free(void * fgvndata)4154 fg_vn_data_free(void *fgvndata)
4155 {
4156 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4157
4158 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4159 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4160 kfree_type(struct fd_vn_data, fvdata);
4161 }
4162
4163 /*
4164 * Check permissions, allocate an open file structure,
4165 * and call the device open routine if any.
4166 *
4167 * Returns: 0 Success
4168 * EINVAL
4169 * EINTR
4170 * falloc:ENFILE
4171 * falloc:EMFILE
4172 * falloc:ENOMEM
4173 * vn_open_auth:???
4174 * dupfdopen:???
4175 * VNOP_ADVLOCK:???
4176 * vnode_setsize:???
4177 *
4178 * XXX Need to implement uid, gid
4179 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct vfs_context context;

	oflags = uflags;

	/* O_RDONLY|O_WRONLY|O_RDWR all at once is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	/* Convert open(2) O_* flags to in-kernel F* flags. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc before doing the open. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		/*
		 * /dev/fd opens report ENODEV/ENXIO and stash the target fd
		 * in uu_dupfd; satisfy the open by duplicating that fd.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

	/* O_EXLOCK/O_SHLOCK: take a flock-style advisory lock at open time. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		/* Block waiting for the lock unless the open is non-blocking. */
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor in the fd table. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's page cache may use secluded memory,
	 * based on writability and (heuristically) on the file's identity.
	 */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo everything: advisory lock, the vn_open, and the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4388
4389 /*
4390 * While most of the *at syscall handlers can call nameiat() which
4391 * is a wrapper around namei, the use of namei and initialisation
4392 * of nameidata are far removed and in different functions - namei
4393 * gets called in vn_open_auth for open1. So we'll just do here what
4394 * nameiat() does.
4395 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd)
{
	/*
	 * Same relative-path handling as nameiat(): the descriptor matters
	 * only when the path is relative and no starting directory vnode
	 * was supplied by the caller (USEDVP).
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: resolve it against the vnode backing dirfd. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Hand open1 (and, through it, namei) the starting
			 * directory via the USEDVP convention.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval);
}
4439
/*
 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			Pointer to an area to receive the
 *					return value from the system call
 *
 * Indirect:	uap->path		Path to open (same as 'open')
 *		uap->flags		Flags to open (same as 'open')
 *		uap->uid		UID to set, if creating
 *		uap->gid		GID to set, if creating
 *		uap->mode		File mode, if creating (same as 'open')
 *		uap->xsecurity		ACL to set, if creating
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
 */
4463 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4464 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4465 {
4466 int ciferror;
4467 kauth_filesec_t xsecdst;
4468 struct vnode_attr va;
4469 struct nameidata nd;
4470 int cmode;
4471
4472 AUDIT_ARG(owner, uap->uid, uap->gid);
4473
4474 xsecdst = NULL;
4475 if ((uap->xsecurity != USER_ADDR_NULL) &&
4476 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4477 return ciferror;
4478 }
4479
4480 VATTR_INIT(&va);
4481 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4482 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4483 if (uap->uid != KAUTH_UID_NONE) {
4484 VATTR_SET(&va, va_uid, uap->uid);
4485 }
4486 if (uap->gid != KAUTH_GID_NONE) {
4487 VATTR_SET(&va, va_gid, uap->gid);
4488 }
4489 if (xsecdst != NULL) {
4490 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4491 va.va_vaflags |= VA_FILESEC_ACL;
4492 }
4493
4494 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4495 uap->path, vfs_context_current());
4496
4497 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4498 NULL, NULL, retval);
4499 if (xsecdst != NULL) {
4500 kauth_filesec_free(xsecdst);
4501 }
4502
4503 return ciferror;
4504 }
4505
4506 /*
4507 * Go through the data-protected atomically controlled open (2)
4508 *
4509 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4510 */
4511 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4512 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4513 {
4514 int flags = uap->flags;
4515 int class = uap->class;
4516 int dpflags = uap->dpflags;
4517
4518 /*
4519 * Follow the same path as normal open(2)
4520 * Look up the item if it exists, and acquire the vnode.
4521 */
4522 struct vnode_attr va;
4523 struct nameidata nd;
4524 int cmode;
4525 int error;
4526
4527 VATTR_INIT(&va);
4528 /* Mask off all but regular access permissions */
4529 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4530 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4531
4532 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4533 uap->path, vfs_context_current());
4534
4535 /*
4536 * Initialize the extra fields in vnode_attr to pass down our
4537 * extra fields.
4538 * 1. target cprotect class.
4539 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4540 */
4541 if (flags & O_CREAT) {
4542 /* lower level kernel code validates that the class is valid before applying it. */
4543 if (class != PROTECTION_CLASS_DEFAULT) {
4544 /*
4545 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4546 * file behave the same as open (2)
4547 */
4548 VATTR_SET(&va, va_dataprotect_class, class);
4549 }
4550 }
4551
4552 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4553 if (flags & (O_RDWR | O_WRONLY)) {
4554 /* Not allowed to write raw encrypted bytes */
4555 return EINVAL;
4556 }
4557 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4558 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4559 }
4560 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4561 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4562 }
4563 }
4564
4565 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4566 NULL, NULL, retval);
4567
4568 return error;
4569 }
4570
4571 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)4572 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4573 int fd, enum uio_seg segflg, int *retval)
4574 {
4575 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4576 struct {
4577 struct vnode_attr va;
4578 struct nameidata nd;
4579 } *__open_data;
4580 struct vnode_attr *vap;
4581 struct nameidata *ndp;
4582 int cmode;
4583 int error;
4584
4585 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
4586 vap = &__open_data->va;
4587 ndp = &__open_data->nd;
4588
4589 VATTR_INIT(vap);
4590 /* Mask off all but regular access permissions */
4591 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4592 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
4593
4594 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4595 segflg, path, ctx);
4596
4597 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd);
4598
4599 kfree_type(typeof(*__open_data), __open_data);
4600
4601 return error;
4602 }
4603
4604 int
open(proc_t p,struct open_args * uap,int32_t * retval)4605 open(proc_t p, struct open_args *uap, int32_t *retval)
4606 {
4607 __pthread_testcancel(1);
4608 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4609 }
4610
4611 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)4612 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4613 int32_t *retval)
4614 {
4615 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4616 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4617 }
4618
4619 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)4620 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4621 int32_t *retval)
4622 {
4623 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4624 uap->mode, uap->fd, UIO_USERSPACE, retval);
4625 }
4626
4627 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)4628 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4629 {
4630 __pthread_testcancel(1);
4631 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4632 }
4633
/*
 * openbyid_np: open a file given a file system id and a file system object id
 *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
 *	file systems that don't support object ids it is a node id (uint64_t).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			Pointer to an area to receive the
 *					return value from the system call
 *
 * Indirect:	uap->path		Path to open (same as 'open')
 *
 *		uap->fsid		id of target file system
 *		uap->objid		id of target file system object
 *		uap->flags		Flags to open (same as 'open')
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
 */
4657 int
openbyid_np(__unused proc_t p,struct openbyid_np_args * uap,int * retval)4658 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
4659 {
4660 fsid_t fsid;
4661 uint64_t objid;
4662 int error;
4663 char *buf = NULL;
4664 int buflen = MAXPATHLEN;
4665 int pathlen = 0;
4666 vfs_context_t ctx = vfs_context_current();
4667
4668 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
4669 return error;
4670 }
4671
4672 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
4673 return error;
4674 }
4675
4676 /*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
4677 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
4678 return error;
4679 }
4680
4681 AUDIT_ARG(value32, fsid.val[0]);
4682 AUDIT_ARG(value64, objid);
4683
4684 /*resolve path from fsis, objid*/
4685 do {
4686 buf = kalloc_data(buflen + 1, Z_WAITOK);
4687 if (buf == NULL) {
4688 return ENOMEM;
4689 }
4690
4691 error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
4692 buf, FSOPT_ISREALFSID, &pathlen);
4693
4694 if (error) {
4695 kfree_data(buf, buflen + 1);
4696 buf = NULL;
4697 }
4698 } while (error == ENOSPC && (buflen += MAXPATHLEN));
4699
4700 if (error) {
4701 return error;
4702 }
4703
4704 buf[pathlen] = 0;
4705
4706 error = openat_internal(
4707 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
4708
4709 kfree_data(buf, buflen + 1);
4710
4711 return error;
4712 }
4713
4714
4715 /*
4716 * Create a special file.
4717 */
4718 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4719
4720 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)4721 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
4722 {
4723 struct vnode_attr va;
4724 vfs_context_t ctx = vfs_context_current();
4725 int error;
4726 struct nameidata nd;
4727 vnode_t vp, dvp;
4728
4729 VATTR_INIT(&va);
4730 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4731 VATTR_SET(&va, va_rdev, uap->dev);
4732
4733 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
4734 if ((uap->mode & S_IFMT) == S_IFIFO) {
4735 return mkfifo1(ctx, uap->path, &va);
4736 }
4737
4738 AUDIT_ARG(mode, (mode_t)uap->mode);
4739 AUDIT_ARG(value32, uap->dev);
4740
4741 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
4742 return error;
4743 }
4744 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
4745 UIO_USERSPACE, uap->path, ctx);
4746 error = namei(&nd);
4747 if (error) {
4748 return error;
4749 }
4750 dvp = nd.ni_dvp;
4751 vp = nd.ni_vp;
4752
4753 if (vp != NULL) {
4754 error = EEXIST;
4755 goto out;
4756 }
4757
4758 switch (uap->mode & S_IFMT) {
4759 case S_IFCHR:
4760 VATTR_SET(&va, va_type, VCHR);
4761 break;
4762 case S_IFBLK:
4763 VATTR_SET(&va, va_type, VBLK);
4764 break;
4765 default:
4766 error = EINVAL;
4767 goto out;
4768 }
4769
4770 #if CONFIG_MACF
4771 error = mac_vnode_check_create(ctx,
4772 nd.ni_dvp, &nd.ni_cnd, &va);
4773 if (error) {
4774 goto out;
4775 }
4776 #endif
4777
4778 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4779 goto out;
4780 }
4781
4782 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
4783 goto out;
4784 }
4785
4786 if (vp) {
4787 int update_flags = 0;
4788
4789 // Make sure the name & parent pointers are hooked up
4790 if (vp->v_name == NULL) {
4791 update_flags |= VNODE_UPDATE_NAME;
4792 }
4793 if (vp->v_parent == NULLVP) {
4794 update_flags |= VNODE_UPDATE_PARENT;
4795 }
4796
4797 if (update_flags) {
4798 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4799 }
4800
4801 #if CONFIG_FSE
4802 add_fsevent(FSE_CREATE_FILE, ctx,
4803 FSE_ARG_VNODE, vp,
4804 FSE_ARG_DONE);
4805 #endif
4806 }
4807
4808 out:
4809 /*
4810 * nameidone has to happen before we vnode_put(dvp)
4811 * since it may need to release the fs_nodelock on the dvp
4812 */
4813 nameidone(&nd);
4814
4815 if (vp) {
4816 vnode_put(vp);
4817 }
4818 vnode_put(dvp);
4819
4820 return error;
4821 }
4822
4823 /*
4824 * Create a named pipe.
4825 *
4826 * Returns: 0 Success
4827 * EEXIST
4828 * namei:???
4829 * vnode_authorize:???
4830 * vn_create:???
4831 */
4832 static int
mkfifo1(vfs_context_t ctx,user_addr_t upath,struct vnode_attr * vap)4833 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4834 {
4835 vnode_t vp, dvp;
4836 int error;
4837 struct nameidata nd;
4838
4839 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4840 UIO_USERSPACE, upath, ctx);
4841 error = namei(&nd);
4842 if (error) {
4843 return error;
4844 }
4845 dvp = nd.ni_dvp;
4846 vp = nd.ni_vp;
4847
4848 /* check that this is a new file and authorize addition */
4849 if (vp != NULL) {
4850 error = EEXIST;
4851 goto out;
4852 }
4853 VATTR_SET(vap, va_type, VFIFO);
4854
4855 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
4856 goto out;
4857 }
4858
4859 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4860 out:
4861 /*
4862 * nameidone has to happen before we vnode_put(dvp)
4863 * since it may need to release the fs_nodelock on the dvp
4864 */
4865 nameidone(&nd);
4866
4867 if (vp) {
4868 vnode_put(vp);
4869 }
4870 vnode_put(dvp);
4871
4872 return error;
4873 }
4874
4875
/*
 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			(Ignored)
 *
 * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
 *		uap->uid		UID to set
 *		uap->gid		GID to set
 *		uap->mode		File mode to set (same as 'mkfifo')
 *		uap->xsecurity		ACL to set, if creating
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
 */
4897 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)4898 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4899 {
4900 int ciferror;
4901 kauth_filesec_t xsecdst;
4902 struct vnode_attr va;
4903
4904 AUDIT_ARG(owner, uap->uid, uap->gid);
4905
4906 xsecdst = KAUTH_FILESEC_NONE;
4907 if (uap->xsecurity != USER_ADDR_NULL) {
4908 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4909 return ciferror;
4910 }
4911 }
4912
4913 VATTR_INIT(&va);
4914 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4915 if (uap->uid != KAUTH_UID_NONE) {
4916 VATTR_SET(&va, va_uid, uap->uid);
4917 }
4918 if (uap->gid != KAUTH_GID_NONE) {
4919 VATTR_SET(&va, va_gid, uap->gid);
4920 }
4921 if (xsecdst != KAUTH_FILESEC_NONE) {
4922 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4923 va.va_vaflags |= VA_FILESEC_ACL;
4924 }
4925
4926 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4927
4928 if (xsecdst != KAUTH_FILESEC_NONE) {
4929 kauth_filesec_free(xsecdst);
4930 }
4931 return ciferror;
4932 }
4933
4934 /* ARGSUSED */
4935 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)4936 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4937 {
4938 struct vnode_attr va;
4939
4940 VATTR_INIT(&va);
4941 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4942
4943 return mkfifo1(vfs_context_current(), uap->path, &va);
4944 }
4945
4946 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4947 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4948 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4949
4950 int
safe_getpath_new(struct vnode * dvp,char * leafname,char * path,int _len,int * truncated_path,int firmlink)4951 safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
4952 {
4953 int ret, len = _len;
4954
4955 *truncated_path = 0;
4956
4957 if (firmlink) {
4958 ret = vn_getpath(dvp, path, &len);
4959 } else {
4960 ret = vn_getpath_no_firmlink(dvp, path, &len);
4961 }
4962 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4963 if (leafname) {
4964 path[len - 1] = '/';
4965 len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
4966 if (len > MAXPATHLEN) {
4967 char *ptr;
4968
4969 // the string got truncated!
4970 *truncated_path = 1;
4971 ptr = strrchr(path, '/');
4972 if (ptr) {
4973 *ptr = '\0'; // chop off the string at the last directory component
4974 }
4975 len = (int)strlen(path) + 1;
4976 }
4977 }
4978 } else if (ret == 0) {
4979 *truncated_path = 1;
4980 } else if (ret != 0) {
4981 struct vnode *mydvp = dvp;
4982
4983 if (ret != ENOSPC) {
4984 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4985 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4986 }
4987 *truncated_path = 1;
4988
4989 do {
4990 if (mydvp->v_parent != NULL) {
4991 mydvp = mydvp->v_parent;
4992 } else if (mydvp->v_mount) {
4993 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4994 break;
4995 } else {
4996 // no parent and no mount point? only thing is to punt and say "/" changed
4997 strlcpy(path, "/", _len);
4998 len = 2;
4999 mydvp = NULL;
5000 }
5001
5002 if (mydvp == NULL) {
5003 break;
5004 }
5005
5006 len = _len;
5007 if (firmlink) {
5008 ret = vn_getpath(mydvp, path, &len);
5009 } else {
5010 ret = vn_getpath_no_firmlink(mydvp, path, &len);
5011 }
5012 } while (ret == ENOSPC);
5013 }
5014
5015 return len;
5016 }
5017
/* Firmlink-following variant of safe_getpath_new(). */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
5023
/* Non-firmlink-following variant of safe_getpath_new(). */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
5029
5030 /*
5031 * Make a hard file link.
5032 *
5033 * Returns: 0 Success
5034 * EPERM
5035 * EEXIST
5036 * EXDEV
5037 * namei:???
5038 * vnode_authorize:???
5039 * VNOP_LINK:???
5040 */
5041 /* ARGSUSED */
5042 static int
linkat_internal(vfs_context_t ctx,int fd1,user_addr_t path,int fd2,user_addr_t link,int flag,enum uio_seg segflg)5043 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
5044 user_addr_t link, int flag, enum uio_seg segflg)
5045 {
5046 vnode_t vp, pvp, dvp, lvp;
5047 struct nameidata nd;
5048 int follow;
5049 int error;
5050 #if CONFIG_FSE
5051 fse_info finfo;
5052 #endif
5053 int need_event, has_listeners, need_kpath2;
5054 char *target_path = NULL;
5055 char *no_firmlink_path = NULL;
5056 int truncated = 0;
5057 int truncated_no_firmlink_path = 0;
5058
5059 vp = dvp = lvp = NULLVP;
5060
5061 /* look up the object we are linking to */
5062 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
5063 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
5064 segflg, path, ctx);
5065
5066 error = nameiat(&nd, fd1);
5067 if (error) {
5068 return error;
5069 }
5070 vp = nd.ni_vp;
5071
5072 nameidone(&nd);
5073
5074 /*
5075 * Normally, linking to directories is not supported.
5076 * However, some file systems may have limited support.
5077 */
5078 if (vp->v_type == VDIR) {
5079 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
5080 error = EPERM; /* POSIX */
5081 goto out;
5082 }
5083
5084 /* Linking to a directory requires ownership. */
5085 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
5086 struct vnode_attr dva;
5087
5088 VATTR_INIT(&dva);
5089 VATTR_WANTED(&dva, va_uid);
5090 if (vnode_getattr(vp, &dva, ctx) != 0 ||
5091 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
5092 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
5093 error = EACCES;
5094 goto out;
5095 }
5096 }
5097 }
5098
5099 /* lookup the target node */
5100 #if CONFIG_TRIGGERS
5101 nd.ni_op = OP_LINK;
5102 #endif
5103 nd.ni_cnd.cn_nameiop = CREATE;
5104 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
5105 nd.ni_dirp = link;
5106 error = nameiat(&nd, fd2);
5107 if (error != 0) {
5108 goto out;
5109 }
5110 dvp = nd.ni_dvp;
5111 lvp = nd.ni_vp;
5112
5113 #if CONFIG_MACF
5114 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
5115 goto out2;
5116 }
5117 #endif
5118
5119 /* or to anything that kauth doesn't want us to (eg. immutable items) */
5120 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
5121 goto out2;
5122 }
5123
5124 /* target node must not exist */
5125 if (lvp != NULLVP) {
5126 error = EEXIST;
5127 goto out2;
5128 }
5129 /* cannot link across mountpoints */
5130 if (vnode_mount(vp) != vnode_mount(dvp)) {
5131 error = EXDEV;
5132 goto out2;
5133 }
5134
5135 /* authorize creation of the target note */
5136 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
5137 goto out2;
5138 }
5139
5140 /* and finally make the link */
5141 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
5142 if (error) {
5143 goto out2;
5144 }
5145
5146 #if CONFIG_MACF
5147 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
5148 #endif
5149
5150 #if CONFIG_FSE
5151 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
5152 #else
5153 need_event = 0;
5154 #endif
5155 has_listeners = kauth_authorize_fileop_has_listeners();
5156
5157 need_kpath2 = 0;
5158 #if CONFIG_AUDIT
5159 if (AUDIT_RECORD_EXISTS()) {
5160 need_kpath2 = 1;
5161 }
5162 #endif
5163
5164 if (need_event || has_listeners || need_kpath2) {
5165 char *link_to_path = NULL;
5166 int len, link_name_len;
5167 int len_no_firmlink_path = 0;
5168
5169 /* build the path to the new link file */
5170 GET_PATH(target_path);
5171
5172 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
5173 if (no_firmlink_path == NULL) {
5174 GET_PATH(no_firmlink_path);
5175 }
5176 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5177
5178 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
5179
5180 if (has_listeners) {
5181 /* build the path to file we are linking to */
5182 GET_PATH(link_to_path);
5183
5184 link_name_len = MAXPATHLEN;
5185 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
5186 /*
5187 * Call out to allow 3rd party notification of rename.
5188 * Ignore result of kauth_authorize_fileop call.
5189 */
5190 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
5191 (uintptr_t)link_to_path,
5192 (uintptr_t)target_path);
5193 }
5194 if (link_to_path != NULL) {
5195 RELEASE_PATH(link_to_path);
5196 }
5197 }
5198 #if CONFIG_FSE
5199 if (need_event) {
5200 /* construct fsevent */
5201 if (get_fse_info(vp, &finfo, ctx) == 0) {
5202 if (truncated_no_firmlink_path) {
5203 finfo.mode |= FSE_TRUNCATED_PATH;
5204 }
5205
5206 // build the path to the destination of the link
5207 add_fsevent(FSE_CREATE_FILE, ctx,
5208 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5209 FSE_ARG_FINFO, &finfo,
5210 FSE_ARG_DONE);
5211 }
5212
5213 pvp = vp->v_parent;
5214 // need an iocount on pvp in this case
5215 if (pvp && pvp != dvp) {
5216 error = vnode_get(pvp);
5217 if (error) {
5218 pvp = NULLVP;
5219 error = 0;
5220 }
5221 }
5222 if (pvp) {
5223 add_fsevent(FSE_STAT_CHANGED, ctx,
5224 FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
5225 }
5226 if (pvp && pvp != dvp) {
5227 vnode_put(pvp);
5228 }
5229 }
5230 #endif
5231 }
5232 out2:
5233 /*
5234 * nameidone has to happen before we vnode_put(dvp)
5235 * since it may need to release the fs_nodelock on the dvp
5236 */
5237 nameidone(&nd);
5238 if (target_path != NULL) {
5239 RELEASE_PATH(target_path);
5240 }
5241 if (no_firmlink_path != NULL) {
5242 RELEASE_PATH(no_firmlink_path);
5243 no_firmlink_path = NULL;
5244 }
5245 out:
5246 if (lvp) {
5247 vnode_put(lvp);
5248 }
5249 if (dvp) {
5250 vnode_put(dvp);
5251 }
5252 vnode_put(vp);
5253 return error;
5254 }
5255
5256 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5257 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5258 {
5259 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5260 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5261 }
5262
5263 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5264 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5265 {
5266 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5267 return EINVAL;
5268 }
5269
5270 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5271 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5272 }
5273
5274 /*
5275 * Make a symbolic link.
5276 *
5277 * We could add support for ACLs here too...
5278 */
5279 /* ARGSUSED */
5280 static int
symlinkat_internal(vfs_context_t ctx,user_addr_t path_data,int fd,user_addr_t link,enum uio_seg segflg)5281 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
5282 user_addr_t link, enum uio_seg segflg)
5283 {
5284 struct vnode_attr va;
5285 char *path;
5286 int error;
5287 struct nameidata nd;
5288 vnode_t vp, dvp;
5289 size_t dummy = 0;
5290 proc_t p;
5291
5292 error = 0;
5293 if (UIO_SEG_IS_USER_SPACE(segflg)) {
5294 path = zalloc(ZV_NAMEI);
5295 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
5296 } else {
5297 path = (char *)path_data;
5298 }
5299 if (error) {
5300 goto out;
5301 }
5302 AUDIT_ARG(text, path); /* This is the link string */
5303
5304 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
5305 segflg, link, ctx);
5306
5307 error = nameiat(&nd, fd);
5308 if (error) {
5309 goto out;
5310 }
5311 dvp = nd.ni_dvp;
5312 vp = nd.ni_vp;
5313
5314 p = vfs_context_proc(ctx);
5315 VATTR_INIT(&va);
5316 VATTR_SET(&va, va_type, VLNK);
5317 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);
5318
5319 #if CONFIG_MACF
5320 error = mac_vnode_check_create(ctx,
5321 dvp, &nd.ni_cnd, &va);
5322 #endif
5323 if (error != 0) {
5324 goto skipit;
5325 }
5326
5327 if (vp != NULL) {
5328 error = EEXIST;
5329 goto skipit;
5330 }
5331
5332 /* authorize */
5333 if (error == 0) {
5334 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
5335 }
5336 /* get default ownership, etc. */
5337 if (error == 0) {
5338 error = vnode_authattr_new(dvp, &va, 0, ctx);
5339 }
5340 if (error == 0) {
5341 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
5342 }
5343
5344 /* do fallback attribute handling */
5345 if (error == 0 && vp) {
5346 error = vnode_setattr_fallback(vp, &va, ctx);
5347 }
5348
5349 #if CONFIG_MACF
5350 if (error == 0 && vp) {
5351 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
5352 }
5353 #endif
5354
5355 if (error == 0) {
5356 int update_flags = 0;
5357
5358 /*check if a new vnode was created, else try to get one*/
5359 if (vp == NULL) {
5360 nd.ni_cnd.cn_nameiop = LOOKUP;
5361 #if CONFIG_TRIGGERS
5362 nd.ni_op = OP_LOOKUP;
5363 #endif
5364 /*
5365 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
5366 * reallocated again in namei().
5367 */
5368 nd.ni_cnd.cn_flags &= HASBUF;
5369 error = nameiat(&nd, fd);
5370 if (error) {
5371 goto skipit;
5372 }
5373 vp = nd.ni_vp;
5374 }
5375
5376 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
5377 /* call out to allow 3rd party notification of rename.
5378 * Ignore result of kauth_authorize_fileop call.
5379 */
5380 if (kauth_authorize_fileop_has_listeners() &&
5381 namei(&nd) == 0) {
5382 char *new_link_path = NULL;
5383 int len;
5384
5385 /* build the path to the new link file */
5386 new_link_path = get_pathbuff();
5387 len = MAXPATHLEN;
5388 vn_getpath(dvp, new_link_path, &len);
5389 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
5390 new_link_path[len - 1] = '/';
5391 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
5392 }
5393
5394 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
5395 (uintptr_t)path, (uintptr_t)new_link_path);
5396 if (new_link_path != NULL) {
5397 release_pathbuff(new_link_path);
5398 }
5399 }
5400 #endif
5401 // Make sure the name & parent pointers are hooked up
5402 if (vp->v_name == NULL) {
5403 update_flags |= VNODE_UPDATE_NAME;
5404 }
5405 if (vp->v_parent == NULLVP) {
5406 update_flags |= VNODE_UPDATE_PARENT;
5407 }
5408
5409 if (update_flags) {
5410 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
5411 }
5412
5413 #if CONFIG_FSE
5414 add_fsevent(FSE_CREATE_FILE, ctx,
5415 FSE_ARG_VNODE, vp,
5416 FSE_ARG_DONE);
5417 #endif
5418 }
5419
5420 skipit:
5421 /*
5422 * nameidone has to happen before we vnode_put(dvp)
5423 * since it may need to release the fs_nodelock on the dvp
5424 */
5425 nameidone(&nd);
5426
5427 if (vp) {
5428 vnode_put(vp);
5429 }
5430 vnode_put(dvp);
5431 out:
5432 if (path && (path != (char *)path_data)) {
5433 zfree(ZV_NAMEI, path);
5434 }
5435
5436 return error;
5437 }
5438
5439 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5440 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5441 {
5442 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5443 uap->link, UIO_USERSPACE);
5444 }
5445
5446 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5447 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5448 __unused int32_t *retval)
5449 {
5450 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5451 uap->path2, UIO_USERSPACE);
5452 }
5453
5454 /*
5455 * Delete a whiteout from the filesystem.
5456 * No longer supported.
5457 */
5458 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5459 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5460 {
5461 return ENOTSUP;
5462 }
5463
5464 /*
5465 * Delete a name from the filesystem.
5466 */
5467 /* ARGSUSED */
5468 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5469 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5470 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5471 {
5472 struct {
5473 struct nameidata nd;
5474 #if CONFIG_FSE
5475 struct vnode_attr va;
5476 fse_info finfo;
5477 #endif
5478 } *__unlink_data;
5479 struct nameidata *ndp;
5480 vnode_t vp, dvp;
5481 int error;
5482 struct componentname *cnp;
5483 char *path = NULL;
5484 char *no_firmlink_path = NULL;
5485 int len_path = 0;
5486 int len_no_firmlink_path = 0;
5487 int flags;
5488 int need_event;
5489 int has_listeners;
5490 int truncated_path;
5491 int truncated_no_firmlink_path;
5492 int batched;
5493 struct vnode_attr *vap;
5494 int do_retry;
5495 int retry_count = 0;
5496 int cn_flags;
5497
5498 cn_flags = LOCKPARENT;
5499 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
5500 cn_flags |= AUDITVNPATH1;
5501 }
5502 /* If a starting dvp is passed, it trumps any fd passed. */
5503 if (start_dvp) {
5504 cn_flags |= USEDVP;
5505 }
5506
5507 #if NAMEDRSRCFORK
5508 /* unlink or delete is allowed on rsrc forks and named streams */
5509 cn_flags |= CN_ALLOWRSRCFORK;
5510 #endif
5511
5512 __unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
5513 ndp = &__unlink_data->nd;
5514 #if CONFIG_FSE
5515 fse_info *finfop = &__unlink_data->finfo;
5516 #endif
5517
5518 retry:
5519 do_retry = 0;
5520 flags = 0;
5521 need_event = 0;
5522 has_listeners = 0;
5523 truncated_path = 0;
5524 truncated_no_firmlink_path = 0;
5525 vap = NULL;
5526
5527 NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
5528
5529 ndp->ni_dvp = start_dvp;
5530 ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
5531 cnp = &ndp->ni_cnd;
5532
5533 continue_lookup:
5534 error = nameiat(ndp, fd);
5535 if (error) {
5536 goto early_out;
5537 }
5538
5539 dvp = ndp->ni_dvp;
5540 vp = ndp->ni_vp;
5541
5542 /* With Carbon delete semantics, busy files cannot be deleted */
5543 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
5544 flags |= VNODE_REMOVE_NODELETEBUSY;
5545 }
5546
5547 /* Skip any potential upcalls if told to. */
5548 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
5549 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
5550 }
5551
5552 if (vp) {
5553 batched = vnode_compound_remove_available(vp);
5554 /*
5555 * The root of a mounted filesystem cannot be deleted.
5556 */
5557 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
5558 error = EBUSY;
5559 goto out;
5560 }
5561
5562 #if DEVELOPMENT || DEBUG
5563 /*
5564 * XXX VSWAP: Check for entitlements or special flag here
5565 * so we can restrict access appropriately.
5566 */
5567 #else /* DEVELOPMENT || DEBUG */
5568
5569 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
5570 error = EPERM;
5571 goto out;
5572 }
5573 #endif /* DEVELOPMENT || DEBUG */
5574
5575 if (!batched) {
5576 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
5577 if (error) {
5578 if (error == ENOENT) {
5579 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5580 do_retry = 1;
5581 retry_count++;
5582 }
5583 }
5584 goto out;
5585 }
5586 }
5587 } else {
5588 batched = 1;
5589
5590 if (!vnode_compound_remove_available(dvp)) {
5591 panic("No vp, but no compound remove?");
5592 }
5593 }
5594
5595 #if CONFIG_FSE
5596 need_event = need_fsevent(FSE_DELETE, dvp);
5597 if (need_event) {
5598 if (!batched) {
5599 if ((vp->v_flag & VISHARDLINK) == 0) {
5600 /* XXX need to get these data in batched VNOP */
5601 get_fse_info(vp, finfop, ctx);
5602 }
5603 } else {
5604 error =
5605 vfs_get_notify_attributes(&__unlink_data->va);
5606 if (error) {
5607 goto out;
5608 }
5609
5610 vap = &__unlink_data->va;
5611 }
5612 }
5613 #endif
5614 has_listeners = kauth_authorize_fileop_has_listeners();
5615 if (need_event || has_listeners) {
5616 if (path == NULL) {
5617 GET_PATH(path);
5618 }
5619 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
5620 if (no_firmlink_path == NULL) {
5621 GET_PATH(no_firmlink_path);
5622 }
5623 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5624 }
5625
5626 #if NAMEDRSRCFORK
5627 if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
5628 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
5629 } else
5630 #endif
5631 {
5632 error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
5633 vp = ndp->ni_vp;
5634 if (error == EKEEPLOOKING) {
5635 if (!batched) {
5636 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
5637 }
5638
5639 if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
5640 panic("EKEEPLOOKING, but continue flag not set?");
5641 }
5642
5643 if (vnode_isdir(vp)) {
5644 error = EISDIR;
5645 goto out;
5646 }
5647 goto continue_lookup;
5648 } else if (error == ENOENT && batched) {
5649 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5650 /*
5651 * For compound VNOPs, the authorization callback may
5652 * return ENOENT in case of racing hardlink lookups
5653 * hitting the name cache, redrive the lookup.
5654 */
5655 do_retry = 1;
5656 retry_count += 1;
5657 goto out;
5658 }
5659 }
5660 }
5661
5662 /*
5663 * Call out to allow 3rd party notification of delete.
5664 * Ignore result of kauth_authorize_fileop call.
5665 */
5666 if (!error) {
5667 if (has_listeners) {
5668 kauth_authorize_fileop(vfs_context_ucred(ctx),
5669 KAUTH_FILEOP_DELETE,
5670 (uintptr_t)vp,
5671 (uintptr_t)path);
5672 }
5673
5674 if (vp->v_flag & VISHARDLINK) {
5675 //
5676 // if a hardlink gets deleted we want to blow away the
5677 // v_parent link because the path that got us to this
5678 // instance of the link is no longer valid. this will
5679 // force the next call to get the path to ask the file
5680 // system instead of just following the v_parent link.
5681 //
5682 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
5683 }
5684
5685 #if CONFIG_FSE
5686 if (need_event) {
5687 if (vp->v_flag & VISHARDLINK) {
5688 get_fse_info(vp, finfop, ctx);
5689 } else if (vap) {
5690 vnode_get_fse_info_from_vap(vp, finfop, vap);
5691 }
5692 if (truncated_path) {
5693 finfop->mode |= FSE_TRUNCATED_PATH;
5694 }
5695 add_fsevent(FSE_DELETE, ctx,
5696 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5697 FSE_ARG_FINFO, finfop,
5698 FSE_ARG_DONE);
5699 }
5700 #endif
5701 }
5702
5703 out:
5704 if (path != NULL) {
5705 RELEASE_PATH(path);
5706 path = NULL;
5707 }
5708
5709 if (no_firmlink_path != NULL) {
5710 RELEASE_PATH(no_firmlink_path);
5711 no_firmlink_path = NULL;
5712 }
5713 #if NAMEDRSRCFORK
5714 /* recycle the deleted rsrc fork vnode to force a reclaim, which
5715 * will cause its shadow file to go away if necessary.
5716 */
5717 if (vp && (vnode_isnamedstream(vp)) &&
5718 (vp->v_parent != NULLVP) &&
5719 vnode_isshadow(vp)) {
5720 vnode_recycle(vp);
5721 }
5722 #endif
5723 /*
5724 * nameidone has to happen before we vnode_put(dvp)
5725 * since it may need to release the fs_nodelock on the dvp
5726 */
5727 nameidone(ndp);
5728 vnode_put(dvp);
5729 if (vp) {
5730 vnode_put(vp);
5731 }
5732
5733 if (do_retry) {
5734 goto retry;
5735 }
5736
5737 early_out:
5738 kfree_type(typeof(*__unlink_data), __unlink_data);
5739 return error;
5740 }
5741
5742 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5743 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5744 enum uio_seg segflg, int unlink_flags)
5745 {
5746 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5747 unlink_flags);
5748 }
5749
5750 /*
5751 * Delete a name from the filesystem using Carbon semantics.
5752 */
5753 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)5754 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5755 {
5756 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5757 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5758 }
5759
5760 /*
5761 * Delete a name from the filesystem using POSIX semantics.
5762 */
5763 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)5764 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5765 {
5766 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5767 uap->path, UIO_USERSPACE, 0);
5768 }
5769
5770 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)5771 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5772 {
5773 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5774 return EINVAL;
5775 }
5776
5777 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5778 int unlink_flags = 0;
5779
5780 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5781 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5782 }
5783 return rmdirat_internal(vfs_context_current(), uap->fd,
5784 uap->path, UIO_USERSPACE, unlink_flags);
5785 } else {
5786 return unlinkat_internal(vfs_context_current(), uap->fd,
5787 NULLVP, uap->path, UIO_USERSPACE, 0);
5788 }
5789 }
5790
5791 /*
5792 * Reposition read/write file offset.
5793 */
5794 int
lseek(proc_t p,struct lseek_args * uap,off_t * retval)5795 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5796 {
5797 struct fileproc *fp;
5798 vnode_t vp;
5799 struct vfs_context *ctx;
5800 off_t offset = uap->offset, file_size;
5801 int error;
5802
5803 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
5804 if (error == ENOTSUP) {
5805 return ESPIPE;
5806 }
5807 return error;
5808 }
5809 if (vnode_isfifo(vp)) {
5810 file_drop(uap->fd);
5811 return ESPIPE;
5812 }
5813
5814
5815 ctx = vfs_context_current();
5816 #if CONFIG_MACF
5817 if (uap->whence == L_INCR && uap->offset == 0) {
5818 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5819 fp->fp_glob);
5820 } else {
5821 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5822 fp->fp_glob);
5823 }
5824 if (error) {
5825 file_drop(uap->fd);
5826 return error;
5827 }
5828 #endif
5829 if ((error = vnode_getwithref(vp))) {
5830 file_drop(uap->fd);
5831 return error;
5832 }
5833
5834 switch (uap->whence) {
5835 case L_INCR:
5836 offset += fp->fp_glob->fg_offset;
5837 break;
5838 case L_XTND:
5839 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
5840 break;
5841 }
5842 offset += file_size;
5843 break;
5844 case L_SET:
5845 break;
5846 case SEEK_HOLE:
5847 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5848 break;
5849 case SEEK_DATA:
5850 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5851 break;
5852 default:
5853 error = EINVAL;
5854 }
5855 if (error == 0) {
5856 if (uap->offset > 0 && offset < 0) {
5857 /* Incremented/relative move past max size */
5858 error = EOVERFLOW;
5859 } else {
5860 /*
5861 * Allow negative offsets on character devices, per
5862 * POSIX 1003.1-2001. Most likely for writing disk
5863 * labels.
5864 */
5865 if (offset < 0 && vp->v_type != VCHR) {
5866 /* Decremented/relative move before start */
5867 error = EINVAL;
5868 } else {
5869 /* Success */
5870 fp->fp_glob->fg_offset = offset;
5871 *retval = fp->fp_glob->fg_offset;
5872 }
5873 }
5874 }
5875
5876 /*
5877 * An lseek can affect whether data is "available to read." Use
5878 * hint of NOTE_NONE so no EVFILT_VNODE events fire
5879 */
5880 post_event_if_success(vp, error, NOTE_NONE);
5881 (void)vnode_put(vp);
5882 file_drop(uap->fd);
5883 return error;
5884 }
5885
5886
5887 /*
5888 * Check access permissions.
5889 *
5890 * Returns: 0 Success
5891 * vnode_authorize:???
5892 */
5893 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)5894 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5895 {
5896 kauth_action_t action;
5897 int error;
5898
5899 /*
5900 * If just the regular access bits, convert them to something
5901 * that vnode_authorize will understand.
5902 */
5903 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5904 action = 0;
5905 if (uflags & R_OK) {
5906 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5907 }
5908 if (uflags & W_OK) {
5909 if (vnode_isdir(vp)) {
5910 action |= KAUTH_VNODE_ADD_FILE |
5911 KAUTH_VNODE_ADD_SUBDIRECTORY;
5912 /* might want delete rights here too */
5913 } else {
5914 action |= KAUTH_VNODE_WRITE_DATA;
5915 }
5916 }
5917 if (uflags & X_OK) {
5918 if (vnode_isdir(vp)) {
5919 action |= KAUTH_VNODE_SEARCH;
5920 } else {
5921 action |= KAUTH_VNODE_EXECUTE;
5922 }
5923 }
5924 } else {
5925 /* take advantage of definition of uflags */
5926 action = uflags >> 8;
5927 }
5928
5929 #if CONFIG_MACF
5930 error = mac_vnode_check_access(ctx, vp, uflags);
5931 if (error) {
5932 return error;
5933 }
5934 #endif /* MAC */
5935
5936 /* action == 0 means only check for existence */
5937 if (action != 0) {
5938 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5939 } else {
5940 error = 0;
5941 }
5942
5943 return error;
5944 }
5945
5946
5947
5948 /*
5949 * access_extended: Check access permissions in bulk.
5950 *
5951 * Description: uap->entries Pointer to an array of accessx
5952 * descriptor structs, plus one or
5953 * more NULL terminated strings (see
5954 * "Notes" section below).
5955 * uap->size Size of the area pointed to by
5956 * uap->entries.
5957 * uap->results Pointer to the results array.
5958 *
5959 * Returns: 0 Success
5960 * ENOMEM Insufficient memory
5961 * EINVAL Invalid arguments
5962 * namei:EFAULT Bad address
5963 * namei:ENAMETOOLONG Filename too long
5964 * namei:ENOENT No such file or directory
5965 * namei:ELOOP Too many levels of symbolic links
5966 * namei:EBADF Bad file descriptor
5967 * namei:ENOTDIR Not a directory
5968 * namei:???
5969 * access1:
5970 *
5971 * Implicit returns:
5972 * uap->results Array contents modified
5973 *
5974 * Notes: The uap->entries are structured as an arbitrary length array
5975 * of accessx descriptors, followed by one or more NULL terminated
5976 * strings
5977 *
5978 * struct accessx_descriptor[0]
5979 * ...
5980 * struct accessx_descriptor[n]
5981 * char name_data[0];
5982 *
5983 * We determine the entry count by walking the buffer containing
5984 * the uap->entries argument descriptor. For each descriptor we
5985 * see, the valid values for the offset ad_name_offset will be
5986 * in the byte range:
5987 *
5988 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5989 * to
5990 * [ uap->entries + uap->size - 2 ]
5991 *
5992 * since we must have at least one string, and the string must
5993 * be at least one character plus the NULL terminator in length.
5994 *
5995 * XXX: Need to support the check-as uid argument
5996 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL until the real-identity credential is taken below */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory. Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 * descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served out of a stack buffer, avoiding kalloc */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end. If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity. This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	/* one errno_t result slot per descriptor we will actually process */
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode. We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file failures are reported in the
		 * result slot; any other lookup error aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6238
6239
6240 /*
6241 * Returns: 0 Success
6242 * namei:EFAULT Bad address
6243 * namei:ENAMETOOLONG Filename too long
6244 * namei:ENOENT No such file or directory
6245 * namei:ELOOP Too many levels of symbolic links
6246 * namei:EBADF Bad file descriptor
6247 * namei:ENOTDIR Not a directory
6248 * namei:???
6249 * access1:
6250 */
6251 static int
faccessat_internal(vfs_context_t ctx,int fd,user_addr_t path,int amode,int flag,enum uio_seg segflg)6252 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
6253 int flag, enum uio_seg segflg)
6254 {
6255 int error;
6256 struct nameidata nd;
6257 int niopts;
6258 struct vfs_context context;
6259 #if NAMEDRSRCFORK
6260 int is_namedstream = 0;
6261 #endif
6262
6263 /*
6264 * Unless the AT_EACCESS option is used, Access is defined as checking
6265 * against the process' real identity, even if operations are checking
6266 * the effective identity. So we need to tweak the credential
6267 * in the context for that case.
6268 */
6269 if (!(flag & AT_EACCESS)) {
6270 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6271 } else {
6272 context.vc_ucred = ctx->vc_ucred;
6273 }
6274 context.vc_thread = ctx->vc_thread;
6275
6276
6277 niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
6278 /* need parent for vnode_authorize for deletion test */
6279 if (amode & _DELETE_OK) {
6280 niopts |= WANTPARENT;
6281 }
6282 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
6283 path, &context);
6284 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6285 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6286 }
6287
6288 #if NAMEDRSRCFORK
6289 /* access(F_OK) calls are allowed for resource forks. */
6290 if (amode == F_OK) {
6291 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6292 }
6293 #endif
6294 error = nameiat(&nd, fd);
6295 if (error) {
6296 goto out;
6297 }
6298
6299 #if NAMEDRSRCFORK
6300 /* Grab reference on the shadow stream file vnode to
6301 * force an inactive on release which will mark it
6302 * for recycle.
6303 */
6304 if (vnode_isnamedstream(nd.ni_vp) &&
6305 (nd.ni_vp->v_parent != NULLVP) &&
6306 vnode_isshadow(nd.ni_vp)) {
6307 is_namedstream = 1;
6308 vnode_ref(nd.ni_vp);
6309 }
6310 #endif
6311
6312 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
6313
6314 #if NAMEDRSRCFORK
6315 if (is_namedstream) {
6316 vnode_rele(nd.ni_vp);
6317 }
6318 #endif
6319
6320 vnode_put(nd.ni_vp);
6321 if (amode & _DELETE_OK) {
6322 vnode_put(nd.ni_dvp);
6323 }
6324 nameidone(&nd);
6325
6326 out:
6327 if (!(flag & AT_EACCESS)) {
6328 kauth_cred_unref(&context.vc_ucred);
6329 }
6330 return error;
6331 }
6332
6333 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6334 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6335 {
6336 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6337 uap->path, uap->flags, 0, UIO_USERSPACE);
6338 }
6339
6340 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6341 faccessat(__unused proc_t p, struct faccessat_args *uap,
6342 __unused int32_t *retval)
6343 {
6344 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6345 return EINVAL;
6346 }
6347
6348 return faccessat_internal(vfs_context_current(), uap->fd,
6349 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6350 }
6351
6352 /*
6353 * Returns: 0 Success
6354 * EFAULT
6355 * copyout:EFAULT
6356 * namei:???
6357 * vn_stat:???
6358 */
6359 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6360 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6361 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6362 enum uio_seg segflg, int fd, int flag)
6363 {
6364 struct nameidata nd;
6365 int follow;
6366 union {
6367 struct stat sb;
6368 struct stat64 sb64;
6369 } source = {};
6370 union {
6371 struct user64_stat user64_sb;
6372 struct user32_stat user32_sb;
6373 struct user64_stat64 user64_sb64;
6374 struct user32_stat64 user32_sb64;
6375 } dest = {};
6376 caddr_t sbp;
6377 int error, my_size;
6378 kauth_filesec_t fsec;
6379 size_t xsecurity_bufsize;
6380 void * statptr;
6381 struct fileproc *fp = NULL;
6382 int needsrealdev = 0;
6383
6384 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6385 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6386 segflg, path, ctx);
6387 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6388 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6389 }
6390
6391 #if NAMEDRSRCFORK
6392 int is_namedstream = 0;
6393 /* stat calls are allowed for resource forks. */
6394 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6395 #endif
6396
6397 if (flag & AT_FDONLY) {
6398 vnode_t fvp;
6399
6400 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6401 if (error) {
6402 return error;
6403 }
6404 if ((error = vnode_getwithref(fvp))) {
6405 file_drop(fd);
6406 return error;
6407 }
6408 nd.ni_vp = fvp;
6409 } else {
6410 error = nameiat(&nd, fd);
6411 if (error) {
6412 return error;
6413 }
6414 }
6415 fsec = KAUTH_FILESEC_NONE;
6416
6417 statptr = (void *)&source;
6418
6419 #if NAMEDRSRCFORK
6420 /* Grab reference on the shadow stream file vnode to
6421 * force an inactive on release which will mark it
6422 * for recycle.
6423 */
6424 if (vnode_isnamedstream(nd.ni_vp) &&
6425 (nd.ni_vp->v_parent != NULLVP) &&
6426 vnode_isshadow(nd.ni_vp)) {
6427 is_namedstream = 1;
6428 vnode_ref(nd.ni_vp);
6429 }
6430 #endif
6431
6432 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6433 if (fp && (xsecurity == USER_ADDR_NULL)) {
6434 /*
6435 * If the caller has the file open, and is not
6436 * requesting extended security information, we are
6437 * going to let them get the basic stat information.
6438 */
6439 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6440 fp->fp_glob->fg_cred);
6441 } else {
6442 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6443 isstat64, needsrealdev, ctx);
6444 }
6445
6446 #if NAMEDRSRCFORK
6447 if (is_namedstream) {
6448 vnode_rele(nd.ni_vp);
6449 }
6450 #endif
6451 vnode_put(nd.ni_vp);
6452 nameidone(&nd);
6453 if (fp) {
6454 file_drop(fd);
6455 fp = NULL;
6456 }
6457
6458 if (error) {
6459 return error;
6460 }
6461 /* Zap spare fields */
6462 if (isstat64 != 0) {
6463 source.sb64.st_lspare = 0;
6464 source.sb64.st_qspare[0] = 0LL;
6465 source.sb64.st_qspare[1] = 0LL;
6466 if (vfs_context_is64bit(ctx)) {
6467 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6468 my_size = sizeof(dest.user64_sb64);
6469 sbp = (caddr_t)&dest.user64_sb64;
6470 } else {
6471 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6472 my_size = sizeof(dest.user32_sb64);
6473 sbp = (caddr_t)&dest.user32_sb64;
6474 }
6475 /*
6476 * Check if we raced (post lookup) against the last unlink of a file.
6477 */
6478 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6479 source.sb64.st_nlink = 1;
6480 }
6481 } else {
6482 source.sb.st_lspare = 0;
6483 source.sb.st_qspare[0] = 0LL;
6484 source.sb.st_qspare[1] = 0LL;
6485 if (vfs_context_is64bit(ctx)) {
6486 munge_user64_stat(&source.sb, &dest.user64_sb);
6487 my_size = sizeof(dest.user64_sb);
6488 sbp = (caddr_t)&dest.user64_sb;
6489 } else {
6490 munge_user32_stat(&source.sb, &dest.user32_sb);
6491 my_size = sizeof(dest.user32_sb);
6492 sbp = (caddr_t)&dest.user32_sb;
6493 }
6494
6495 /*
6496 * Check if we raced (post lookup) against the last unlink of a file.
6497 */
6498 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6499 source.sb.st_nlink = 1;
6500 }
6501 }
6502 if ((error = copyout(sbp, ub, my_size)) != 0) {
6503 goto out;
6504 }
6505
6506 /* caller wants extended security information? */
6507 if (xsecurity != USER_ADDR_NULL) {
6508 /* did we get any? */
6509 if (fsec == KAUTH_FILESEC_NONE) {
6510 if (susize(xsecurity_size, 0) != 0) {
6511 error = EFAULT;
6512 goto out;
6513 }
6514 } else {
6515 /* find the user buffer size */
6516 xsecurity_bufsize = fusize(xsecurity_size);
6517
6518 /* copy out the actual data size */
6519 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6520 error = EFAULT;
6521 goto out;
6522 }
6523
6524 /* if the caller supplied enough room, copy out to it */
6525 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6526 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6527 }
6528 }
6529 }
6530 out:
6531 if (fsec != KAUTH_FILESEC_NONE) {
6532 kauth_filesec_free(fsec);
6533 }
6534 return error;
6535 }
6536
6537 /*
6538 * stat_extended: Get file status; with extended security (ACL).
6539 *
6540 * Parameters: p (ignored)
6541 * uap User argument descriptor (see below)
6542 * retval (ignored)
6543 *
6544 * Indirect: uap->path Path of file to get status from
6545 * uap->ub User buffer (holds file status info)
6546 * uap->xsecurity ACL to get (extended security)
6547 * uap->xsecurity_size Size of ACL
6548 *
6549 * Returns: 0 Success
6550 * !0 errno value
6551 *
6552 */
6553 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)6554 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6555 __unused int32_t *retval)
6556 {
6557 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6558 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6559 0);
6560 }
6561
6562 /*
6563 * Returns: 0 Success
6564 * fstatat_internal:??? [see fstatat_internal() in this file]
6565 */
6566 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)6567 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6568 {
6569 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6570 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6571 }
6572
6573 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)6574 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6575 {
6576 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6577 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6578 }
6579
6580 /*
6581 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6582 *
6583 * Parameters: p (ignored)
6584 * uap User argument descriptor (see below)
6585 * retval (ignored)
6586 *
6587 * Indirect: uap->path Path of file to get status from
6588 * uap->ub User buffer (holds file status info)
6589 * uap->xsecurity ACL to get (extended security)
6590 * uap->xsecurity_size Size of ACL
6591 *
6592 * Returns: 0 Success
6593 * !0 errno value
6594 *
6595 */
6596 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)6597 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6598 {
6599 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6600 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6601 0);
6602 }
6603
6604 /*
6605 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6606 *
6607 * Parameters: p (ignored)
6608 * uap User argument descriptor (see below)
6609 * retval (ignored)
6610 *
6611 * Indirect: uap->path Path of file to get status from
6612 * uap->ub User buffer (holds file status info)
6613 * uap->xsecurity ACL to get (extended security)
6614 * uap->xsecurity_size Size of ACL
6615 *
6616 * Returns: 0 Success
6617 * !0 errno value
6618 *
6619 */
6620 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)6621 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6622 {
6623 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6624 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6625 AT_SYMLINK_NOFOLLOW);
6626 }
6627
6628 /*
6629 * Get file status; this version does not follow links.
6630 */
6631 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)6632 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6633 {
6634 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6635 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6636 }
6637
6638 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)6639 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6640 {
6641 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6642 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6643 }
6644
6645 /*
6646 * lstat64_extended: Get file status; can handle large inode numbers; does not
6647 * follow links; with extended security (ACL).
6648 *
6649 * Parameters: p (ignored)
6650 * uap User argument descriptor (see below)
6651 * retval (ignored)
6652 *
6653 * Indirect: uap->path Path of file to get status from
6654 * uap->ub User buffer (holds file status info)
6655 * uap->xsecurity ACL to get (extended security)
6656 * uap->xsecurity_size Size of ACL
6657 *
6658 * Returns: 0 Success
6659 * !0 errno value
6660 *
6661 */
6662 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)6663 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6664 {
6665 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6666 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6667 AT_SYMLINK_NOFOLLOW);
6668 }
6669
6670 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)6671 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6672 {
6673 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
6674 return EINVAL;
6675 }
6676
6677 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6678 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6679 }
6680
6681 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)6682 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6683 __unused int32_t *retval)
6684 {
6685 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
6686 return EINVAL;
6687 }
6688
6689 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6690 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6691 }
6692
6693 /*
6694 * Get configurable pathname variables.
6695 *
6696 * Returns: 0 Success
6697 * namei:???
6698 * vn_pathconf:???
6699 *
6700 * Notes: Global implementation constants are intended to be
6701 * implemented in this function directly; all other constants
6702 * are per-FS implementation, and therefore must be handled in
6703 * each respective FS, instead.
6704 *
6705 * XXX We implement some things globally right now that should actually be
6706 * XXX per-FS; we will need to deal with this at some point.
6707 */
6708 /* ARGSUSED */
6709 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)6710 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6711 {
6712 int error;
6713 struct nameidata nd;
6714 vfs_context_t ctx = vfs_context_current();
6715
6716 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6717 UIO_USERSPACE, uap->path, ctx);
6718 error = namei(&nd);
6719 if (error) {
6720 return error;
6721 }
6722
6723 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6724
6725 vnode_put(nd.ni_vp);
6726 nameidone(&nd);
6727 return error;
6728 }
6729
6730 /*
6731 * Return target name of a symbolic link.
6732 */
6733 /* ARGSUSED */
6734 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)6735 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
6736 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6737 int *retval)
6738 {
6739 vnode_t vp;
6740 uio_t auio;
6741 int error;
6742 struct nameidata nd;
6743 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
6744 bool put_vnode;
6745
6746 if (bufsize > INT32_MAX) {
6747 return EINVAL;
6748 }
6749
6750 if (lnk_vp) {
6751 vp = lnk_vp;
6752 put_vnode = false;
6753 } else {
6754 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6755 seg, path, ctx);
6756
6757 error = nameiat(&nd, fd);
6758 if (error) {
6759 return error;
6760 }
6761 vp = nd.ni_vp;
6762 put_vnode = true;
6763 nameidone(&nd);
6764 }
6765
6766 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6767 &uio_buf[0], sizeof(uio_buf));
6768 uio_addiov(auio, buf, bufsize);
6769 if (vp->v_type != VLNK) {
6770 error = EINVAL;
6771 } else {
6772 #if CONFIG_MACF
6773 error = mac_vnode_check_readlink(ctx, vp);
6774 #endif
6775 if (error == 0) {
6776 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6777 ctx);
6778 }
6779 if (error == 0) {
6780 error = VNOP_READLINK(vp, auio, ctx);
6781 }
6782 }
6783
6784 if (put_vnode) {
6785 vnode_put(vp);
6786 }
6787
6788 *retval = (int)(bufsize - uio_resid(auio));
6789 return error;
6790 }
6791
6792 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)6793 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
6794 {
6795 enum uio_seg procseg;
6796 vnode_t vp;
6797 int error;
6798
6799 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6800
6801 AUDIT_ARG(fd, uap->fd);
6802
6803 if ((error = file_vnode(uap->fd, &vp))) {
6804 return error;
6805 }
6806 if ((error = vnode_getwithref(vp))) {
6807 file_drop(uap->fd);
6808 return error;
6809 }
6810
6811 error = readlinkat_internal(vfs_context_current(), -1,
6812 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
6813 uap->bufsize, procseg, retval);
6814
6815 vnode_put(vp);
6816 file_drop(uap->fd);
6817 return error;
6818 }
6819
6820 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)6821 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6822 {
6823 enum uio_seg procseg;
6824
6825 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6826 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
6827 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6828 uap->count, procseg, retval);
6829 }
6830
6831 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)6832 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6833 {
6834 enum uio_seg procseg;
6835
6836 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6837 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
6838 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
6839 retval);
6840 }
6841
6842 /*
6843 * Change file flags, the deep inner layer.
6844 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC preflight of the flags change before any authorization. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* Apply the change through the caller-supplied setattr callback. */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only if the change actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
6883
6884 /*
6885 * Change file flags.
6886 *
6887 * NOTE: this will vnode_put() `vp'
6888 */
6889 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)6890 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6891 {
6892 struct vnode_attr va;
6893 int error;
6894
6895 VATTR_INIT(&va);
6896 VATTR_SET(&va, va_flags, flags);
6897
6898 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6899 vnode_put(vp);
6900
6901 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6902 error = ENOTSUP;
6903 }
6904
6905 return error;
6906 }
6907
6908 /*
6909 * Change flags of a file given a path name.
6910 */
6911 /* ARGSUSED */
6912 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)6913 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6914 {
6915 vnode_t vp;
6916 vfs_context_t ctx = vfs_context_current();
6917 int error;
6918 struct nameidata nd;
6919
6920 AUDIT_ARG(fflags, uap->flags);
6921 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6922 UIO_USERSPACE, uap->path, ctx);
6923 error = namei(&nd);
6924 if (error) {
6925 return error;
6926 }
6927 vp = nd.ni_vp;
6928 nameidone(&nd);
6929
6930 /* we don't vnode_put() here because chflags1 does internally */
6931 error = chflags1(vp, uap->flags, ctx);
6932
6933 return error;
6934 }
6935
6936 /*
6937 * Change flags of a file given a file descriptor.
6938 */
6939 /* ARGSUSED */
6940 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)6941 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6942 {
6943 vnode_t vp;
6944 int error;
6945
6946 AUDIT_ARG(fd, uap->fd);
6947 AUDIT_ARG(fflags, uap->flags);
6948 if ((error = file_vnode(uap->fd, &vp))) {
6949 return error;
6950 }
6951
6952 if ((error = vnode_getwithref(vp))) {
6953 file_drop(uap->fd);
6954 return error;
6955 }
6956
6957 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6958
6959 /* we don't vnode_put() here because chflags1 does internally */
6960 error = chflags1(vp, uap->flags, vfs_context_current());
6961
6962 file_drop(uap->fd);
6963 return error;
6964 }
6965
6966 /*
6967 * Change security information on a filesystem object.
6968 *
6969 * Returns: 0 Success
6970 * EPERM Operation not permitted
6971 * vnode_authattr:??? [anything vnode_authattr can return]
6972 * vnode_authorize:??? [anything vnode_authorize can return]
6973 * vnode_setattr:??? [anything vnode_setattr can return]
6974 *
6975 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6976 * translated to EPERM before being returned.
6977 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC preflight: mode, ownership and ACL changes each get a check. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 marks an id the caller is not changing. */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures are reported as EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Notify MAC modules of the changes that actually took effect. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7045
7046
7047 /*
7048 * Change mode of a file given a path name.
7049 *
7050 * Returns: 0 Success
7051 * namei:??? [anything namei can return]
7052 * chmod_vnode:??? [anything chmod_vnode can return]
7053 */
7054 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7055 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7056 int fd, int flag, enum uio_seg segflg)
7057 {
7058 struct nameidata nd;
7059 int follow, error;
7060
7061 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7062 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
7063 segflg, path, ctx);
7064 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7065 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7066 }
7067 if ((error = nameiat(&nd, fd))) {
7068 return error;
7069 }
7070 error = chmod_vnode(ctx, nd.ni_vp, vap);
7071 vnode_put(nd.ni_vp);
7072 nameidone(&nd);
7073 return error;
7074 }
7075
7076 /*
7077 * chmod_extended: Change the mode of a file given a path name; with extended
7078 * argument list (including extended security (ACL)).
7079 *
7080 * Parameters: p Process requesting the open
7081 * uap User argument descriptor (see below)
7082 * retval (ignored)
7083 *
7084 * Indirect: uap->path Path to object (same as 'chmod')
7085 * uap->uid UID to set
7086 * uap->gid GID to set
7087 * uap->mode File mode to set (same as 'chmod')
7088 * uap->xsecurity ACL to set (or delete)
7089 *
7090 * Returns: 0 Success
7091 * !0 errno value
7092 *
7093 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7094 *
7095 * XXX: We should enummerate the possible errno values here, and where
7096 * in the code they originated.
7097 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* A value of -1 / KAUTH_*_NONE means "leave that attribute alone". */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/* Copy in the caller-supplied filesec; freed below. */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7144
7145 /*
7146 * Returns: 0 Success
7147 * chmodat:??? [anything chmodat can return]
7148 */
7149 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7150 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7151 int flag, enum uio_seg segflg)
7152 {
7153 struct vnode_attr va;
7154
7155 VATTR_INIT(&va);
7156 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7157
7158 return chmodat(ctx, path, &va, fd, flag, segflg);
7159 }
7160
7161 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7162 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7163 {
7164 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7165 AT_FDCWD, 0, UIO_USERSPACE);
7166 }
7167
7168 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7169 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7170 {
7171 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7172 return EINVAL;
7173 }
7174
7175 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7176 uap->fd, uap->flag, UIO_USERSPACE);
7177 }
7178
7179 /*
7180 * Change mode of a file given a file descriptor.
7181 */
7182 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7183 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7184 {
7185 vnode_t vp;
7186 int error;
7187
7188 AUDIT_ARG(fd, fd);
7189
7190 if ((error = file_vnode(fd, &vp)) != 0) {
7191 return error;
7192 }
7193 if ((error = vnode_getwithref(vp)) != 0) {
7194 file_drop(fd);
7195 return error;
7196 }
7197 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7198
7199 error = chmod_vnode(vfs_context_current(), vp, vap);
7200 (void)vnode_put(vp);
7201 file_drop(fd);
7202
7203 return error;
7204 }
7205
7206 /*
7207 * fchmod_extended: Change mode of a file given a file descriptor; with
7208 * extended argument list (including extended security (ACL)).
7209 *
7210 * Parameters: p Process requesting to change file mode
7211 * uap User argument descriptor (see below)
7212 * retval (ignored)
7213 *
7214 * Indirect: uap->mode File mode to set (same as 'chmod')
7215 * uap->uid UID to set
7216 * uap->gid GID to set
7217 * uap->xsecurity ACL to set (or delete)
7218 * uap->fd File descriptor of file to change mode
7219 *
7220 * Returns: 0 Success
7221 * !0 errno value
7222 *
7223 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* A mode of -1 means "do not change the mode". */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	} else {
		/* NOTE(review): va_mode zeroed but not marked active — confirm intended. */
		va.va_mode = 0;
	}

	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* NULL and (void *)1 both request removal of any existing ACL. */
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		/* Copy in the caller-supplied filesec; freed after fchmod1(). */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	error = fchmod1(p, uap->fd, &va);


	/* Only the default case above allocated xsecdst. */
	switch (uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL) {
			kauth_filesec_free(xsecdst);
		}
	}
	return error;
}
7280
7281 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7282 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7283 {
7284 struct vnode_attr va;
7285
7286 VATTR_INIT(&va);
7287 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7288
7289 return fchmod1(p, uap->fd, &va);
7290 }
7291
7292
7293 /*
7294 * Set ownership given a path name.
7295 */
7296 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	/* Either "no follow" flag suppresses following the final symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	vnode_put(vp);
	return error;
}
7366
7367 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7368 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7369 {
7370 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7371 uap->uid, uap->gid, 0, UIO_USERSPACE);
7372 }
7373
7374 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7375 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7376 {
7377 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7378 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7379 }
7380
7381 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7382 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7383 {
7384 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7385 return EINVAL;
7386 }
7387
7388 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7389 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7390 }
7391
7392 /*
7393 * Set ownership given a file descriptor.
7394 */
7395 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures report EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only if the change actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7464
7465 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)7466 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7467 {
7468 int error;
7469
7470 if (usrtvp == USER_ADDR_NULL) {
7471 struct timeval old_tv;
7472 /* XXX Y2038 bug because of microtime argument */
7473 microtime(&old_tv);
7474 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7475 tsp[1] = tsp[0];
7476 } else {
7477 if (IS_64BIT_PROCESS(current_proc())) {
7478 struct user64_timeval tv[2];
7479 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7480 if (error) {
7481 return error;
7482 }
7483 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
7484 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
7485 } else {
7486 struct user32_timeval tv[2];
7487 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7488 if (error) {
7489 return error;
7490 }
7491 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7492 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7493 }
7494 }
7495 return 0;
7496 }
7497
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] is the access time, ts[1] the modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* Caller passed no explicit times ("set to now"). */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Explicit-times permission failures report EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only if the change actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7554
7555 /*
7556 * Set the access and modification times of a file.
7557 */
7558 /* ARGSUSED */
7559 int
utimes(__unused proc_t p,struct utimes_args * uap,__unused int32_t * retval)7560 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
7561 {
7562 struct timespec ts[2];
7563 user_addr_t usrtvp;
7564 int error;
7565 struct nameidata nd;
7566 vfs_context_t ctx = vfs_context_current();
7567
7568 /*
7569 * AUDIT: Needed to change the order of operations to do the
7570 * name lookup first because auditing wants the path.
7571 */
7572 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
7573 UIO_USERSPACE, uap->path, ctx);
7574 error = namei(&nd);
7575 if (error) {
7576 return error;
7577 }
7578 nameidone(&nd);
7579
7580 /*
7581 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
7582 * the current time instead.
7583 */
7584 usrtvp = uap->tptr;
7585 if ((error = getutimes(usrtvp, ts)) != 0) {
7586 goto out;
7587 }
7588
7589 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
7590
7591 out:
7592 vnode_put(nd.ni_vp);
7593 return error;
7594 }
7595
7596 /*
7597 * Set the access and modification times of a file.
7598 */
7599 /* ARGSUSED */
7600 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)7601 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7602 {
7603 struct timespec ts[2];
7604 vnode_t vp;
7605 user_addr_t usrtvp;
7606 int error;
7607
7608 AUDIT_ARG(fd, uap->fd);
7609 usrtvp = uap->tptr;
7610 if ((error = getutimes(usrtvp, ts)) != 0) {
7611 return error;
7612 }
7613 if ((error = file_vnode(uap->fd, &vp)) != 0) {
7614 return error;
7615 }
7616 if ((error = vnode_getwithref(vp))) {
7617 file_drop(uap->fd);
7618 return error;
7619 }
7620
7621 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7622 vnode_put(vp);
7623 file_drop(uap->fd);
7624 return error;
7625 }
7626
7627 /*
7628 * Truncate a file given its path name.
7629 */
7630 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;
	rlim_t fsize_limit;

	/* Negative lengths are invalid. */
	if (uap->length < 0) {
		return EINVAL;
	}

	/* Enforce RLIMIT_FSIZE like a write past the limit: SIGXFSZ + EFBIG. */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Truncation is implemented as a data-size attribute change. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	/* Authorize, then apply the size change via setattr. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only if the truncate actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7689
7690 /*
7691 * Truncate a file given a file descriptor.
7692 */
7693 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	rlim_t fsize_limit;

	AUDIT_ARG(fd, uap->fd);
	/* Negative lengths are invalid. */
	if (uap->length < 0) {
		return EINVAL;
	}

	/* Enforce RLIMIT_FSIZE like a write past the limit: SIGXFSZ + EFBIG. */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	/* Dispatch on descriptor type: POSIX shm has its own truncate path. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->fp_glob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	/* Truncation is implemented as a data-size attribute change. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only if the truncate actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7768
7769
7770 /*
7771 * Sync an open file with synchronized I/O _file_ integrity completion
7772 */
7773 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync() is a pthread cancellation point; test before doing work. */
	__pthread_testcancel(1);
	/* MNT_WAIT: full file integrity (data and metadata). */
	return fsync_common(p, uap, MNT_WAIT);
}
7780
7781
7782 /*
7783 * Sync an open file with synchronized I/O _file_ integrity completion
7784 *
7785 * Notes: This is a legacy support function that does not test for
7786 * thread cancellation points.
7787 */
7788 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Same as fsync() but without the cancellation-point test. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
7794
7795
7796 /*
7797 * Sync an open file with synchronized I/O _data_ integrity completion
7798 */
7799 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync() is a pthread cancellation point; test before doing work. */
	__pthread_testcancel(1);
	/* MNT_DWAIT: data integrity only (see fsync_common comments). */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
7806
7807
7808 /*
7809 * fsync_common
7810 *
7811 * Common fsync code to support both synchronized I/O file integrity completion
7812 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7813 *
7814 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7815 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
7817 * includes additional metadata unnecessary for retrieving the file data
7818 * contents, such as atime, mtime, ctime, etc., also be committed to stable
7819 * storage.
7820 *
7821 * Parameters: p The process
7822 * uap->fd The descriptor to synchronize
7823 * flags The data integrity flags
7824 *
7825 * Returns: int Success
7826 * fp_getfvp:EBADF Bad file descriptor
7827 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7828 * VNOP_FSYNC:??? unspecified
7829 *
7830 * Notes: We use struct fsync_args because it is a short name, and all
7831 * caller argument structures are otherwise identical.
7832 */
7833 static int
fsync_common(proc_t p,struct fsync_args * uap,int flags)7834 fsync_common(proc_t p, struct fsync_args *uap, int flags)
7835 {
7836 vnode_t vp;
7837 struct fileproc *fp;
7838 vfs_context_t ctx = vfs_context_current();
7839 int error;
7840
7841 AUDIT_ARG(fd, uap->fd);
7842
7843 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
7844 return error;
7845 }
7846 if ((error = vnode_getwithref(vp))) {
7847 file_drop(uap->fd);
7848 return error;
7849 }
7850
7851 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7852
7853 error = VNOP_FSYNC(vp, flags, ctx);
7854
7855 #if NAMEDRSRCFORK
7856 /* Sync resource fork shadow file if necessary. */
7857 if ((error == 0) &&
7858 (vp->v_flag & VISNAMEDSTREAM) &&
7859 (vp->v_parent != NULLVP) &&
7860 vnode_isshadow(vp) &&
7861 (fp->fp_glob->fg_flag & FWASWRITTEN)) {
7862 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
7863 }
7864 #endif
7865
7866 (void)vnode_put(vp);
7867 file_drop(uap->fd);
7868 return error;
7869 }
7870
/*
 * Duplicate files. Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 * perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; on success we hold an iocount on fvp. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target with intent to create. SAVESTART keeps
	 * tond.ni_startdir referenced so it can be released below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories cannot be copied via this interface. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are unsupported (fdesc vnodes are tagged VT_FDESC). */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Caller must be able to read the source... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ...delete an existing target... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ...and add an entry in the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* The source may not be the target directory itself. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1; /* sentinel: mapped to success on return */
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* fvp == tvp was flagged with -1 above; report it as success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
7980
7981 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7982
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only files, symlinks and (non-root, non-mountpoint) dirs can be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination with intent to create, keeping the parent. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross filesystems. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Caller must be allowed to add an entry to the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * Caller needs read access to the source; skip the READ_DATA check
	 * when it was already authorised via an open fd (fclonefileat path).
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACL. The clone file
	 * will inherit the target directory's ACL.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/*
	 * NOTE(review): va_acl is not in the WANTED set above, so this branch
	 * is not expected to be taken — confirm before relying on it.
	 */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		/*
		 * NOTE(review): a vnode_authattr_new() failure is not checked
		 * here; the VLNK path falls through to VNOP_CLONEFILE, which
		 * overwrites 'error'. Confirm this is intended.
		 */
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			/* Other vnode types generate no fsevent; go clean up. */
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8207
8208 /*
8209 * clone files or directories, target must not exist.
8210 */
8211 /* ARGSUSED */
8212 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8213 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8214 __unused int32_t *retval)
8215 {
8216 vnode_t fvp;
8217 struct nameidata fromnd;
8218 int follow;
8219 int error;
8220 vfs_context_t ctx = vfs_context_current();
8221
8222 /* Check that the flags are valid. */
8223 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8224 return EINVAL;
8225 }
8226
8227 AUDIT_ARG(fd, uap->src_dirfd);
8228
8229 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8230 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8231 UIO_USERSPACE, uap->src, ctx);
8232 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8233 return error;
8234 }
8235
8236 fvp = fromnd.ni_vp;
8237 nameidone(&fromnd);
8238
8239 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8240 uap->flags, ctx);
8241
8242 vnode_put(fvp);
8243 return error;
8244 }
8245
8246 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8247 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8248 __unused int32_t *retval)
8249 {
8250 vnode_t fvp;
8251 struct fileproc *fp;
8252 int error;
8253 vfs_context_t ctx = vfs_context_current();
8254
8255 /* Check that the flags are valid. */
8256 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8257 return EINVAL;
8258 }
8259
8260 AUDIT_ARG(fd, uap->src_fd);
8261 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8262 if (error) {
8263 return error;
8264 }
8265
8266 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8267 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8268 error = EBADF;
8269 goto out;
8270 }
8271
8272 if ((error = vnode_getwithref(fvp))) {
8273 goto out;
8274 }
8275
8276 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8277
8278 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8279 uap->flags, ctx);
8280
8281 vnode_put(fvp);
8282 out:
8283 file_drop(uap->src_fd);
8284 return error;
8285 }
8286
8287 static int
rename_submounts_callback(mount_t mp,void * arg)8288 rename_submounts_callback(mount_t mp, void *arg)
8289 {
8290 int error = 0;
8291 mount_t pmp = (mount_t)arg;
8292 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8293
8294 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8295 return 0;
8296 }
8297
8298 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8299 return 0;
8300 }
8301
8302 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8303 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8304 return -1;
8305 }
8306
8307 int pathlen = MAXPATHLEN;
8308 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8309 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8310 }
8311
8312 vfs_unbusy(mp);
8313
8314 return error;
8315 }
8316
8317 /*
8318 * Rename files. Source and destination must either both be directories,
8319 * or both not be directories. If target is a directory, it must be empty.
8320 */
8321 /* ARGSUSED */
8322 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8323 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8324 int tofd, user_addr_t to, int segflg, u_int uflags)
8325 {
8326 vnode_t tvp, tdvp;
8327 vnode_t fvp, fdvp;
8328 vnode_t mnt_fvp;
8329 struct nameidata *fromnd, *tond;
8330 int error;
8331 int do_retry;
8332 int retry_count;
8333 int mntrename;
8334 int need_event;
8335 int need_kpath2;
8336 int has_listeners;
8337 const char *oname = NULL;
8338 char *from_name = NULL, *to_name = NULL;
8339 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8340 int from_len = 0, to_len = 0;
8341 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8342 int holding_mntlock;
8343 int vn_authorize_skipped;
8344 mount_t locked_mp = NULL;
8345 vnode_t oparent = NULLVP;
8346 #if CONFIG_FSE
8347 fse_info from_finfo = {}, to_finfo;
8348 #endif
8349 int from_truncated = 0, to_truncated = 0;
8350 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8351 int batched = 0;
8352 struct vnode_attr *fvap, *tvap;
8353 int continuing = 0;
8354 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8355 int32_t nofollow_any = 0;
8356 /* carving out a chunk for structs that are too big to be on stack. */
8357 struct {
8358 struct nameidata from_node, to_node;
8359 struct vnode_attr fv_attr, tv_attr;
8360 } * __rename_data;
8361
8362 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8363 fromnd = &__rename_data->from_node;
8364 tond = &__rename_data->to_node;
8365
8366 holding_mntlock = 0;
8367 do_retry = 0;
8368 retry_count = 0;
8369 retry:
8370 fvp = tvp = NULL;
8371 fdvp = tdvp = NULL;
8372 fvap = tvap = NULL;
8373 mnt_fvp = NULLVP;
8374 mntrename = FALSE;
8375 vn_authorize_skipped = FALSE;
8376
8377 if (uflags & RENAME_NOFOLLOW_ANY) {
8378 nofollow_any = NAMEI_NOFOLLOW_ANY;
8379 }
8380 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8381 segflg, from, ctx);
8382 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8383
8384 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8385 segflg, to, ctx);
8386 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8387
8388 continue_lookup:
8389 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8390 if ((error = nameiat(fromnd, fromfd))) {
8391 goto out1;
8392 }
8393 fdvp = fromnd->ni_dvp;
8394 fvp = fromnd->ni_vp;
8395
8396 if (fvp && fvp->v_type == VDIR) {
8397 tond->ni_cnd.cn_flags |= WILLBEDIR;
8398 }
8399 }
8400
8401 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8402 if ((error = nameiat(tond, tofd))) {
8403 /*
8404 * Translate error code for rename("dir1", "dir2/.").
8405 */
8406 if (error == EISDIR && fvp->v_type == VDIR) {
8407 error = EINVAL;
8408 }
8409 goto out1;
8410 }
8411 tdvp = tond->ni_dvp;
8412 tvp = tond->ni_vp;
8413 }
8414
8415 #if DEVELOPMENT || DEBUG
8416 /*
8417 * XXX VSWAP: Check for entitlements or special flag here
8418 * so we can restrict access appropriately.
8419 */
8420 #else /* DEVELOPMENT || DEBUG */
8421
8422 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8423 error = EPERM;
8424 goto out1;
8425 }
8426
8427 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8428 error = EPERM;
8429 goto out1;
8430 }
8431 #endif /* DEVELOPMENT || DEBUG */
8432
8433 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8434 error = ENOENT;
8435 goto out1;
8436 }
8437
8438 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8439 int32_t pval = 0;
8440 int err = 0;
8441
8442 /*
8443 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8444 * has the same name as target iff the following conditions are met:
8445 * 1. the target file system is case insensitive
8446 * 2. source and target directories are the same
8447 * 3. source and target files are the same
8448 * 4. name only differs in case (determined by underlying filesystem)
8449 */
8450 if (fvp != tvp || fdvp != tdvp) {
8451 error = EEXIST;
8452 goto out1;
8453 }
8454
8455 /*
8456 * Assume that the target file system is case sensitive if
8457 * _PC_CASE_SENSITIVE selector isn't supported.
8458 */
8459 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8460 if (err != 0 || pval != 0) {
8461 error = EEXIST;
8462 goto out1;
8463 }
8464 }
8465
8466 batched = vnode_compound_rename_available(fdvp);
8467
8468 #if CONFIG_FSE
8469 need_event = need_fsevent(FSE_RENAME, fdvp);
8470 if (need_event) {
8471 if (fvp) {
8472 get_fse_info(fvp, &from_finfo, ctx);
8473 } else {
8474 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8475 if (error) {
8476 goto out1;
8477 }
8478
8479 fvap = &__rename_data->fv_attr;
8480 }
8481
8482 if (tvp) {
8483 get_fse_info(tvp, &to_finfo, ctx);
8484 } else if (batched) {
8485 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8486 if (error) {
8487 goto out1;
8488 }
8489
8490 tvap = &__rename_data->tv_attr;
8491 }
8492 }
8493 #else
8494 need_event = 0;
8495 #endif /* CONFIG_FSE */
8496
8497 has_listeners = kauth_authorize_fileop_has_listeners();
8498
8499 need_kpath2 = 0;
8500 #if CONFIG_AUDIT
8501 if (AUDIT_RECORD_EXISTS()) {
8502 need_kpath2 = 1;
8503 }
8504 #endif
8505
8506 if (need_event || has_listeners) {
8507 if (from_name == NULL) {
8508 GET_PATH(from_name);
8509 }
8510
8511 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8512
8513 if (from_name_no_firmlink == NULL) {
8514 GET_PATH(from_name_no_firmlink);
8515 }
8516
8517 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8518 }
8519
8520 if (need_event || need_kpath2 || has_listeners) {
8521 if (to_name == NULL) {
8522 GET_PATH(to_name);
8523 }
8524
8525 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8526
8527 if (to_name_no_firmlink == NULL) {
8528 GET_PATH(to_name_no_firmlink);
8529 }
8530
8531 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8532 if (to_name && need_kpath2) {
8533 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8534 }
8535 }
8536 if (!fvp) {
8537 /*
8538 * Claim: this check will never reject a valid rename.
8539 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8540 * Suppose fdvp and tdvp are not on the same mount.
8541 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8542 * then you can't move it to within another dir on the same mountpoint.
8543 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8544 *
8545 * If this check passes, then we are safe to pass these vnodes to the same FS.
8546 */
8547 if (fdvp->v_mount != tdvp->v_mount) {
8548 error = EXDEV;
8549 goto out1;
8550 }
8551 goto skipped_lookup;
8552 }
8553
8554 /*
8555 * If the source and destination are the same (i.e. they're
8556 * links to the same vnode) and the target file system is
8557 * case sensitive, then there is nothing to do.
8558 *
8559 * XXX Come back to this.
8560 */
8561 if (fvp == tvp) {
8562 int pathconf_val;
8563
8564 /*
8565 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8566 * then assume that this file system is case sensitive.
8567 */
8568 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8569 pathconf_val != 0) {
8570 vn_authorize_skipped = TRUE;
8571 goto out1;
8572 }
8573 }
8574
8575 /*
8576 * Allow the renaming of mount points.
8577 * - target must not exist
8578 * - target must reside in the same directory as source
8579 * - union mounts cannot be renamed
8580 * - the root fs, and tightly-linked system volumes, cannot be renamed
8581 *
8582 * XXX Handle this in VFS after a continued lookup (if we missed
8583 * in the cache to start off)
8584 *
8585 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8586 * we'll skip past here. The file system is responsible for
8587 * checking that @tvp is not a descendent of @fvp and vice versa
8588 * so it should always return EINVAL if either @tvp or @fvp is the
8589 * root of a volume.
8590 */
8591 if ((fvp->v_flag & VROOT) &&
8592 (fvp->v_type == VDIR) &&
8593 (tvp == NULL) &&
8594 (fvp->v_mountedhere == NULL) &&
8595 (fdvp == tdvp) &&
8596 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8597 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8598 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8599 vnode_t coveredvp;
8600
8601 /* switch fvp to the covered vnode */
8602 coveredvp = fvp->v_mount->mnt_vnodecovered;
8603 if ((vnode_getwithref(coveredvp))) {
8604 error = ENOENT;
8605 goto out1;
8606 }
8607 /*
8608 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
8609 * later.
8610 */
8611 mnt_fvp = fvp;
8612
8613 fvp = coveredvp;
8614 mntrename = TRUE;
8615 }
8616 /*
8617 * Check for cross-device rename.
8618 */
8619 if ((fvp->v_mount != tdvp->v_mount) ||
8620 (tvp && (fvp->v_mount != tvp->v_mount))) {
8621 error = EXDEV;
8622 goto out1;
8623 }
8624
8625 /*
8626 * If source is the same as the destination (that is the
8627 * same inode number) then there is nothing to do...
8628 * EXCEPT if the underlying file system supports case
8629 * insensitivity and is case preserving. In this case
8630 * the file system needs to handle the special case of
8631 * getting the same vnode as target (fvp) and source (tvp).
8632 *
8633 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8634 * and _PC_CASE_PRESERVING can have this exception, and they need to
8635 * handle the special case of getting the same vnode as target and
8636 * source. NOTE: Then the target is unlocked going into vnop_rename,
8637 * so not to cause locking problems. There is a single reference on tvp.
8638 *
8639 * NOTE - that fvp == tvp also occurs if they are hard linked and
8640 * that correct behaviour then is just to return success without doing
8641 * anything.
8642 *
8643 * XXX filesystem should take care of this itself, perhaps...
8644 */
8645 if (fvp == tvp && fdvp == tdvp) {
8646 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8647 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8648 fromnd->ni_cnd.cn_namelen)) {
8649 vn_authorize_skipped = TRUE;
8650 goto out1;
8651 }
8652 }
8653
8654 if (holding_mntlock && fvp->v_mount != locked_mp) {
8655 /*
8656 * we're holding a reference and lock
8657 * on locked_mp, but it no longer matches
8658 * what we want to do... so drop our hold
8659 */
8660 mount_unlock_renames(locked_mp);
8661 mount_drop(locked_mp, 0);
8662 holding_mntlock = 0;
8663 }
8664 if (tdvp != fdvp && fvp->v_type == VDIR) {
8665 /*
8666 * serialize renames that re-shape
8667 * the tree... if holding_mntlock is
8668 * set, then we're ready to go...
8669 * otherwise we
8670 * first need to drop the iocounts
8671 * we picked up, second take the
8672 * lock to serialize the access,
8673 * then finally start the lookup
8674 * process over with the lock held
8675 */
8676 if (!holding_mntlock) {
8677 /*
8678 * need to grab a reference on
8679 * the mount point before we
8680 * drop all the iocounts... once
8681 * the iocounts are gone, the mount
8682 * could follow
8683 */
8684 locked_mp = fvp->v_mount;
8685 mount_ref(locked_mp, 0);
8686
8687 /*
8688 * nameidone has to happen before we vnode_put(tvp)
8689 * since it may need to release the fs_nodelock on the tvp
8690 */
8691 nameidone(tond);
8692
8693 if (tvp) {
8694 vnode_put(tvp);
8695 }
8696 vnode_put(tdvp);
8697
8698 /*
8699 * nameidone has to happen before we vnode_put(fdvp)
8700 * since it may need to release the fs_nodelock on the fvp
8701 */
8702 nameidone(fromnd);
8703
8704 vnode_put(fvp);
8705 vnode_put(fdvp);
8706
8707 if (mnt_fvp != NULLVP) {
8708 vnode_put(mnt_fvp);
8709 }
8710
8711 mount_lock_renames(locked_mp);
8712 holding_mntlock = 1;
8713
8714 goto retry;
8715 }
8716 } else {
8717 /*
8718 * when we dropped the iocounts to take
8719 * the lock, we allowed the identity of
8720 * the various vnodes to change... if they did,
8721 * we may no longer be dealing with a rename
8722 * that reshapes the tree... once we're holding
8723 * the iocounts, the vnodes can't change type
8724 * so we're free to drop the lock at this point
8725 * and continue on
8726 */
8727 if (holding_mntlock) {
8728 mount_unlock_renames(locked_mp);
8729 mount_drop(locked_mp, 0);
8730 holding_mntlock = 0;
8731 }
8732 }
8733
8734 if (!batched) {
8735 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
8736 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8737 flags, NULL);
8738 if (error) {
8739 if (error == ENOENT) {
8740 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8741 /*
8742 * We encountered a race where after doing the namei,
8743 * tvp stops being valid. If so, simply re-drive the rename
8744 * call from the top.
8745 */
8746 do_retry = 1;
8747 retry_count += 1;
8748 }
8749 }
8750 goto out1;
8751 }
8752 }
8753
8754 /* Release the 'mnt_fvp' now that it is no longer needed. */
8755 if (mnt_fvp != NULLVP) {
8756 vnode_put(mnt_fvp);
8757 mnt_fvp = NULLVP;
8758 }
8759
8760 // save these off so we can later verify that fvp is the same
8761 oname = fvp->v_name;
8762 oparent = fvp->v_parent;
8763
8764 skipped_lookup:
8765 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8766 tdvp, &tvp, &tond->ni_cnd, tvap,
8767 flags, ctx);
8768
8769 if (holding_mntlock) {
8770 /*
8771 * we can drop our serialization
8772 * lock now
8773 */
8774 mount_unlock_renames(locked_mp);
8775 mount_drop(locked_mp, 0);
8776 holding_mntlock = 0;
8777 }
8778 if (error) {
8779 if (error == EDATALESS) {
8780 /*
8781 * If we've been here before, something has gone
8782 * horribly wrong and we should just get out lest
8783 * we spiral around the drain forever.
8784 */
8785 if (flags & VFS_RENAME_DATALESS) {
8786 error = EIO;
8787 goto out1;
8788 }
8789
8790 /*
8791 * The object we're renaming is dataless (or has a
8792 * dataless descendent) and requires materialization
8793 * before the rename occurs. But we're holding the
8794 * mount point's rename lock, so it's not safe to
8795 * make the upcall.
8796 *
8797 * In this case, we release the lock, perform the
8798 * materialization, and start the whole thing over.
8799 */
8800 error = vnode_materialize_dataless_file(fvp,
8801 NAMESPACE_HANDLER_RENAME_OP);
8802
8803 if (error == 0) {
8804 /*
8805 * The next time around we need to tell the
				 * file system that the materialization has
8807 * been performed.
8808 */
8809 flags |= VFS_RENAME_DATALESS;
8810 do_retry = 1;
8811 }
8812 goto out1;
8813 }
8814 if (error == EKEEPLOOKING) {
8815 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8816 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8817 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8818 }
8819 }
8820
8821 fromnd->ni_vp = fvp;
8822 tond->ni_vp = tvp;
8823
8824 goto continue_lookup;
8825 }
8826
8827 /*
8828 * We may encounter a race in the VNOP where the destination didn't
8829 * exist when we did the namei, but it does by the time we go and
8830 * try to create the entry. In this case, we should re-drive this rename
8831 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8832 * but other filesystems susceptible to this race could return it, too.
8833 */
8834 if (error == ERECYCLE) {
8835 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8836 do_retry = 1;
8837 retry_count += 1;
8838 } else {
8839 printf("rename retry limit due to ERECYCLE reached\n");
8840 error = ENOENT;
8841 }
8842 }
8843
8844 /*
8845 * For compound VNOPs, the authorization callback may return
8846 * ENOENT in case of racing hardlink lookups hitting the name
8847 * cache, redrive the lookup.
8848 */
8849 if (batched && error == ENOENT) {
8850 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8851 do_retry = 1;
8852 retry_count += 1;
8853 }
8854 }
8855
8856 goto out1;
8857 }
8858
8859 /* call out to allow 3rd party notification of rename.
8860 * Ignore result of kauth_authorize_fileop call.
8861 */
8862 kauth_authorize_fileop(vfs_context_ucred(ctx),
8863 KAUTH_FILEOP_RENAME,
8864 (uintptr_t)from_name, (uintptr_t)to_name);
8865 if (flags & VFS_RENAME_SWAP) {
8866 kauth_authorize_fileop(vfs_context_ucred(ctx),
8867 KAUTH_FILEOP_RENAME,
8868 (uintptr_t)to_name, (uintptr_t)from_name);
8869 }
8870
8871 #if CONFIG_FSE
8872 if (from_name != NULL && to_name != NULL) {
8873 if (from_truncated || to_truncated) {
8874 // set it here since only the from_finfo gets reported up to user space
8875 from_finfo.mode |= FSE_TRUNCATED_PATH;
8876 }
8877
8878 if (tvap && tvp) {
8879 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8880 }
8881 if (fvap) {
8882 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8883 }
8884
8885 if (tvp) {
8886 add_fsevent(FSE_RENAME, ctx,
8887 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8888 FSE_ARG_FINFO, &from_finfo,
8889 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8890 FSE_ARG_FINFO, &to_finfo,
8891 FSE_ARG_DONE);
8892 if (flags & VFS_RENAME_SWAP) {
8893 /*
8894 * Strictly speaking, swap is the equivalent of
8895 * *three* renames. FSEvents clients should only take
8896 * the events as a hint, so we only bother reporting
8897 * two.
8898 */
8899 add_fsevent(FSE_RENAME, ctx,
8900 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8901 FSE_ARG_FINFO, &to_finfo,
8902 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8903 FSE_ARG_FINFO, &from_finfo,
8904 FSE_ARG_DONE);
8905 }
8906 } else {
8907 add_fsevent(FSE_RENAME, ctx,
8908 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8909 FSE_ARG_FINFO, &from_finfo,
8910 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8911 FSE_ARG_DONE);
8912 }
8913 }
8914 #endif /* CONFIG_FSE */
8915
8916 /*
8917 * update filesystem's mount point data
8918 */
8919 if (mntrename) {
8920 char *cp, *pathend, *mpname;
8921 char * tobuf;
8922 struct mount *mp;
8923 int maxlen;
8924 size_t len = 0;
8925
8926 mp = fvp->v_mountedhere;
8927
8928 if (vfs_busy(mp, LK_NOWAIT)) {
8929 error = EBUSY;
8930 goto out1;
8931 }
8932 tobuf = zalloc(ZV_NAMEI);
8933
8934 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8935 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8936 } else {
8937 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8938 }
8939 if (!error) {
8940 /* find current mount point prefix */
8941 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8942 for (cp = pathend; *cp != '\0'; ++cp) {
8943 if (*cp == '/') {
8944 pathend = cp + 1;
8945 }
8946 }
8947 /* find last component of target name */
8948 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8949 if (*cp == '/') {
8950 mpname = cp + 1;
8951 }
8952 }
8953
8954 /* Update f_mntonname of sub mounts */
8955 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8956
8957 /* append name to prefix */
8958 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
8959 bzero(pathend, maxlen);
8960
8961 strlcpy(pathend, mpname, maxlen);
8962 }
8963 zfree(ZV_NAMEI, tobuf);
8964
8965 vfs_unbusy(mp);
8966
8967 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8968 }
8969 /*
8970 * fix up name & parent pointers. note that we first
8971 * check that fvp has the same name/parent pointers it
8972 * had before the rename call... this is a 'weak' check
8973 * at best...
8974 *
8975 * XXX oparent and oname may not be set in the compound vnop case
8976 */
8977 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8978 int update_flags;
8979
8980 update_flags = VNODE_UPDATE_NAME;
8981
8982 if (fdvp != tdvp) {
8983 update_flags |= VNODE_UPDATE_PARENT;
8984 }
8985
8986 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8987 }
8988 out1:
8989 /*
8990 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
8991 * skipped earlier as no actual rename was performed.
8992 */
8993 if (vn_authorize_skipped && error == 0) {
8994 error = vn_authorize_renamex_with_paths(fdvp, fvp,
8995 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8996 flags, NULL);
8997 if (error && error == ENOENT) {
8998 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8999 do_retry = 1;
9000 retry_count += 1;
9001 }
9002 }
9003 }
9004 if (to_name != NULL) {
9005 RELEASE_PATH(to_name);
9006 to_name = NULL;
9007 }
9008 if (to_name_no_firmlink != NULL) {
9009 RELEASE_PATH(to_name_no_firmlink);
9010 to_name_no_firmlink = NULL;
9011 }
9012 if (from_name != NULL) {
9013 RELEASE_PATH(from_name);
9014 from_name = NULL;
9015 }
9016 if (from_name_no_firmlink != NULL) {
9017 RELEASE_PATH(from_name_no_firmlink);
9018 from_name_no_firmlink = NULL;
9019 }
9020 if (holding_mntlock) {
9021 mount_unlock_renames(locked_mp);
9022 mount_drop(locked_mp, 0);
9023 holding_mntlock = 0;
9024 }
9025 if (tdvp) {
9026 /*
9027 * nameidone has to happen before we vnode_put(tdvp)
9028 * since it may need to release the fs_nodelock on the tdvp
9029 */
9030 nameidone(tond);
9031
9032 if (tvp) {
9033 vnode_put(tvp);
9034 }
9035 vnode_put(tdvp);
9036 }
9037 if (fdvp) {
9038 /*
9039 * nameidone has to happen before we vnode_put(fdvp)
9040 * since it may need to release the fs_nodelock on the fdvp
9041 */
9042 nameidone(fromnd);
9043
9044 if (fvp) {
9045 vnode_put(fvp);
9046 }
9047 vnode_put(fdvp);
9048 }
9049 if (mnt_fvp != NULLVP) {
9050 vnode_put(mnt_fvp);
9051 }
9052 /*
9053 * If things changed after we did the namei, then we will re-drive
9054 * this rename call from the top.
9055 */
9056 if (do_retry) {
9057 do_retry = 0;
9058 goto retry;
9059 }
9060
9061 kfree_type(typeof(*__rename_data), __rename_data);
9062 return error;
9063 }
9064
9065 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9066 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9067 {
9068 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9069 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9070 }
9071
9072 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9073 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9074 {
9075 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9076 return EINVAL;
9077 }
9078
9079 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9080 return EINVAL;
9081 }
9082
9083 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9084 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9085 }
9086
9087 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9088 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9089 {
9090 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9091 uap->tofd, uap->to, UIO_USERSPACE, 0);
9092 }
9093
9094 /*
9095 * Make a directory file.
9096 *
9097 * Returns: 0 Success
9098 * EEXIST
9099 * namei:???
9100 * vnode_authorize:???
9101 * vn_create:???
9102 */
9103 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Resolve the parent directory (LOCKPARENT); the leaf itself must not
	 * exist yet.  NAMEI_COMPOUNDMKDIR lets filesystems that support the
	 * compound mkdir VNOP handle lookup+create in one shot.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Target already exists: mkdir must fail with EEXIST. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Lookup failed too: keep the original EACCES/EPERM. */
				goto out;
			} else {
				/* Target exists after all: report EEXIST instead. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to redrive the lookup. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9215
9216 /*
9217 * mkdir_extended: Create a directory; with extended security (ACL).
9218 *
9219 * Parameters: p Process requesting to create the directory
9220 * uap User argument descriptor (see below)
9221 * retval (ignored)
9222 *
9223 * Indirect: uap->path Path of directory to create
9224 * uap->mode Access permissions to set
9225 * uap->xsecurity ACL to set
9226 *
9227 * Returns: 0 Success
9228 * !0 Not success
9229 *
9230 */
9231 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9232 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9233 {
9234 int ciferror;
9235 kauth_filesec_t xsecdst;
9236 struct vnode_attr va;
9237
9238 AUDIT_ARG(owner, uap->uid, uap->gid);
9239
9240 xsecdst = NULL;
9241 if ((uap->xsecurity != USER_ADDR_NULL) &&
9242 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9243 return ciferror;
9244 }
9245
9246 VATTR_INIT(&va);
9247 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9248 if (xsecdst != NULL) {
9249 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9250 va.va_vaflags |= VA_FILESEC_ACL;
9251 }
9252
9253 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9254 UIO_USERSPACE);
9255 if (xsecdst != NULL) {
9256 kauth_filesec_free(xsecdst);
9257 }
9258 return ciferror;
9259 }
9260
9261 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9262 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9263 {
9264 struct vnode_attr va;
9265
9266 VATTR_INIT(&va);
9267 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9268
9269 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9270 UIO_USERSPACE);
9271 }
9272
9273 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9274 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9275 {
9276 struct vnode_attr va;
9277
9278 VATTR_INIT(&va);
9279 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9280
9281 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9282 UIO_USERSPACE);
9283 }
9284
/*
 * Common implementation backing rmdir(2).
 *
 * Resolves 'dirpath' (relative to 'fd', interpreted per 'segflg') and removes
 * the directory.  The outer do/while restarts the operation on races:
 * compound-VNOP authorization returning ENOENT, or a concurrent removal of
 * orphaned AppleDouble files.  'unlink_flags' may carry
 * VNODE_REMOVE_DATALESS_DIR to allow removing a non-empty dataless directory
 * via VNOP_REMOVE.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/* ENOENT here can be a racing lookup; allow a bounded redrive. */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the filesystem must handle lookup+rmdir itself. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Batched: ask the FS for the attributes fsevents needs. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		/* Build paths only when someone (fsevents / kauth) will consume them. */
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to redrive the lookup. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * NOTE(review): the wakeup/tsleep pair below presumably
		 * coordinates concurrent removers in the appleDouble restart
		 * path (the "rm AD" wmesg) — confirm against that helper.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
9558
9559 /*
9560 * Remove a directory file.
9561 */
9562 /* ARGSUSED */
9563 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)9564 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9565 {
9566 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9567 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9568 }
9569
/* Get direntry length padded to 8 byte alignment */
/* (namlen is the name length excluding the terminating NUL) */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/* Get dirent length padded to 4 byte alignment */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent */
/* NOTE(review): only meaningful once d_reclen has been bounds-checked. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
9581
9582 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)9583 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9584 int *numdirent, vfs_context_t ctxp)
9585 {
9586 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9587 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9588 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9589 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9590 } else {
9591 size_t bufsize;
9592 void * bufptr;
9593 uio_t auio;
9594 struct direntry *entry64;
9595 struct dirent *dep;
9596 size_t bytesread;
9597 int error;
9598
9599 /*
9600 * We're here because the underlying file system does not
9601 * support direnties or we mounted denying support so we must
9602 * fall back to dirents and convert them to direntries.
9603 *
9604 * Our kernel buffer needs to be smaller since re-packing will
9605 * expand each dirent. The worse case (when the name length
9606 * is 3 or less) corresponds to a struct direntry size of 32
9607 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9608 * (4-byte aligned). So having a buffer that is 3/8 the size
9609 * will prevent us from reading more than we can pack.
9610 *
9611 * Since this buffer is wired memory, we will limit the
9612 * buffer size to a maximum of 32K. We would really like to
9613 * use 32K in the MIN(), but we use magic number 87371 to
9614 * prevent uio_resid() * 3 / 8 from overflowing.
9615 */
9616 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9617 bufptr = kalloc_data(bufsize, Z_WAITOK);
9618 if (bufptr == NULL) {
9619 return ENOMEM;
9620 }
9621
9622 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9623 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9624 auio->uio_offset = uio->uio_offset;
9625
9626 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9627
9628 dep = (struct dirent *)bufptr;
9629 bytesread = bufsize - uio_resid(auio);
9630
9631 entry64 = kalloc_type(struct direntry, Z_WAITOK);
9632 /*
9633 * Convert all the entries and copy them out to user's buffer.
9634 */
9635 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9636 /* First check that the dirent struct up to d_name is within the buffer */
9637 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
9638 /* Check that the length of the entire dirent is within the buffer */
9639 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9640 /* Check that the actual length including the name doesn't exceed d_reclen */
9641 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9642 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9643 vp->v_mount->mnt_vfsstat.f_mntonname,
9644 vp->v_name ? vp->v_name : "<unknown>");
9645 error = EIO;
9646 break;
9647 }
9648
9649 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9650
9651 bzero(entry64, enbufsize);
9652 /* Convert a dirent to a dirent64. */
9653 entry64->d_ino = dep->d_ino;
9654 entry64->d_seekoff = 0;
9655 entry64->d_reclen = (uint16_t)enbufsize;
9656 entry64->d_namlen = dep->d_namlen;
9657 entry64->d_type = dep->d_type;
9658 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9659
9660 /* Move to next entry. */
9661 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9662
9663 /* Copy entry64 to user's buffer. */
9664 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9665 }
9666
9667 /* Update the real offset using the offset we got from VNOP_READDIR. */
9668 if (error == 0) {
9669 uio->uio_offset = auio->uio_offset;
9670 }
9671 uio_free(auio);
9672 kfree_data(bufptr, bufsize);
9673 kfree_type(struct direntry, entry64);
9674 return error;
9675 }
9676 }
9677
/* Upper bound on a single getdirentries(2) transfer (128 MiB). */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9679
9680 /*
9681 * Read a block of directory entries in a file system independent format.
9682 */
/*
 * Common implementation for getdirentries(2)/getdirentries64(2).
 *
 * Reads up to 'bufsize' bytes of directory entries from 'fd' into the user
 * buffer 'bufp', advancing the fileglob offset.  On success, *bytesread
 * holds the number of bytes transferred, *offset (if non-NULL) the file
 * offset as it was before the read, and *eofflag whatever the filesystem
 * reported.  'flags' selects the extended (struct direntry) format via
 * VNODE_READDIR_EXTENDED.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-fileglob offset lock, then re-check that the fd still
	 * maps to the vnode we looked up; if it was swapped underneath us
	 * (e.g. by the union-mount replacement below, racing in another
	 * thread), drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the transfer to the global per-call limit. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Wrap the user buffer in a uio positioned at the current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: drop down to the
	 * underlying directory, install it in the fd, and read from there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
9796
9797
9798 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)9799 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9800 {
9801 off_t offset;
9802 ssize_t bytesread;
9803 int error, eofflag;
9804
9805 AUDIT_ARG(fd, uap->fd);
9806 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9807 &bytesread, &offset, &eofflag, 0);
9808
9809 if (error == 0) {
9810 if (proc_is64bit(p)) {
9811 user64_long_t base = (user64_long_t)offset;
9812 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9813 } else {
9814 user32_long_t base = (user32_long_t)offset;
9815 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9816 }
9817 *retval = (int)bytesread;
9818 }
9819 return error;
9820 }
9821
9822 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)9823 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9824 {
9825 off_t offset;
9826 ssize_t bytesread;
9827 int error, eofflag;
9828 user_size_t bufsize;
9829
9830 AUDIT_ARG(fd, uap->fd);
9831
9832 /*
9833 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9834 * then the kernel carves out the last 4 bytes to return extended
9835 * information to userspace (namely whether we reached EOF with this call).
9836 */
9837 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9838 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9839 } else {
9840 bufsize = uap->bufsize;
9841 }
9842
9843 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9844 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9845
9846 if (error == 0) {
9847 *retval = bytesread;
9848 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9849
9850 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9851 getdirentries64_flags_t flags = 0;
9852 if (eofflag) {
9853 flags |= GETDIRENTRIES64_EOF;
9854 }
9855 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9856 sizeof(flags));
9857 }
9858 }
9859 return error;
9860 }
9861
9862
9863 /*
9864 * Set the mode mask for creation of filesystem nodes.
9865 * XXX implement xsecurity
9866 */
9867 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9868 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)9869 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9870 {
9871 AUDIT_ARG(mask, newmask);
9872 proc_fdlock(p);
9873 *retval = p->p_fd.fd_cmask;
9874 p->p_fd.fd_cmask = newmask & ALLPERMS;
9875 proc_fdunlock(p);
9876 return 0;
9877 }
9878
9879 /*
9880 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9881 *
9882 * Parameters: p Process requesting to set the umask
9883 * uap User argument descriptor (see below)
9884 * retval umask of the process (parameter p)
9885 *
9886 * Indirect: uap->newmask umask to set
9887 * uap->xsecurity ACL to set
9888 *
9889 * Returns: 0 Success
9890 * !0 Not success
9891 *
9892 */
9893 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)9894 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9895 {
9896 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
9897 }
9898
9899 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)9900 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9901 {
9902 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9903 }
9904
9905 /*
9906 * Void all references to file by ripping underlying filesystem
9907 * away from vnode.
9908 */
9909 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a filesystem mounted on it is busy. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the node or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if the vnode is actually in use or aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
9962
9963
/*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
 * The following system calls are designed to support features
 * which are specific to the HFS & HFS Plus volume formats
 */
9969
9970
9971 /*
9972 * Obtain attribute information on objects in a directory while enumerating
9973 * the directory.
9974 */
/*
 * getdirentriesattr: bulk-read directory entries together with their
 * attributes (legacy HFS-era interface).
 *
 * Copies in the caller's attrlist and entry count, validates the fd and
 * its read permission, authorizes the directory read, then calls
 * VNOP_READDIRATTR() to fill the user buffer.  On success the updated
 * entry count, directory state, and starting offset are copied back out
 * and *retval is set to the EOF flag.
 *
 * Returns 0 on success, or an errno (EBADF, EINVAL, permission errors,
 * or a copyin/copyout/VNOP failure).
 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* remember the caller's count; it is restored when descending a union layer */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still
	 * refers to the vnode we looked up: the union-mount path below can
	 * swap the fd's backing vnode, so retry from scratch if it changed.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* the directory must have been opened for reading */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY. If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					/* install the lower-layer vnode in the fd and restart there */
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* copy out the updated count, the new directory state, and the start offset */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10139
10140 /*
10141 * Exchange data between two files
10142 */
10143
/*
 * exchangedata: atomically swap the data of two regular files on the
 * same volume (exchangedata(2)).
 *
 * Looks up both paths (honoring FSOPT_NOFOLLOW), validates that they are
 * distinct regular files on the same mount, performs MACF and kauth
 * read/write authorization on both, then calls VNOP_EXCHANGE().  On
 * success the cached v_name/v_parent of the two vnodes are swapped so the
 * name cache stays consistent, and fileop listeners / fsevents are
 * notified with both paths.
 *
 * Returns 0 on success, EINVAL (same file, or non-regular file),
 * EXDEV (different volumes), or an errno from lookup/authorization/VNOP.
 */
/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* caller needs both read and write access to both files */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only build the (potentially expensive) path strings if someone
	 * is actually listening: an fsevents watcher or a fileop listener.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The data swapped, but each vnode keeps its identity, so
		 * swap the cached names (and parents, if they differ) to
		 * keep the name cache consistent with on-disk reality.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10295
10296 /*
10297 * Return (in MB) the amount of freespace on the given vnode's volume.
10298 */
10299 uint32_t freespace_mb(vnode_t vp);
10300
10301 uint32_t
freespace_mb(vnode_t vp)10302 freespace_mb(vnode_t vp)
10303 {
10304 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10305 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10306 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10307 }
10308
10309 #if CONFIG_SEARCHFS
10310
/*
 * searchfs: ask the file system to search a volume for objects matching
 * the caller-supplied criteria (searchfs(2)).
 *
 * Copies in the fssearchblock (munging the 32-bit layout into the 64-bit
 * form for 32-bit callers), validates the search parameter buffers,
 * resolves uap->path to its volume root, and dispatches to
 * VNOP_SEARCHFS().  Union mounts are handled by iterating the layers,
 * one layer per call, using state saved in the user's searchstate.
 *
 * Returns 0 on success; EAGAIN means "more matches may remain, call
 * again".  The match count and opaque search state are copied back out
 * even on EAGAIN.
 */
/* ARGSUSED */

int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/* */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
	/* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
	/* assumes the size is still 556 bytes it will continue to work */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* the name data must lie entirely within the searchparams1 buffer */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		/* take an iocount on the covered vnode before dropping the upper one */
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Alright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state. Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* report the file system's result (possibly EAGAIN) only after the copyouts succeed */
	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
10595
10596 #else /* CONFIG_SEARCHFS */
10597
/*
 * searchfs stub for kernels built without CONFIG_SEARCHFS: the syscall
 * always fails with ENOTSUP.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10603
10604 #endif /* CONFIG_SEARCHFS */
10605
10606
10607 #if CONFIG_DATALESS_FILES
10608
10609 /*
10610 * === Namespace Resolver Up-call Mechanism ===
10611 *
10612 * When I/O is performed to a dataless file or directory (read, write,
10613 * lookup-in, etc.), the file system performs an upcall to the namespace
10614 * resolver (filecoordinationd) to materialize the object.
10615 *
10616 * We need multiple up-calls to be in flight at once, and we need these
10617 * up-calls to be interruptible, thus the following implementation:
10618 *
10619 * => The nspace_resolver_request represents the in-kernel request state.
10620 * It contains a request ID, storage space for the errno code returned
10621 * by filecoordinationd, and flags.
10622 *
10623 * => The request ID is simply a global monotonically incrementing 32-bit
10624 * number. Outstanding requests are stored in a hash table, and the
10625 * hash function is extremely simple.
10626 *
10627 * => When an upcall is to be made to filecoordinationd, a request structure
10628 * is allocated on the stack (it is small, and needs to live only during
10629 * the duration of the call to resolve_nspace_item_ext()). It is
10630 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
10632 * can be inserted into the table (and thus limiting the number of
10633 * outstanding requests issued to filecoordinationd); waiting for an
10634 * available slot is interruptible.
10635 *
10636 * => Once the request has been inserted into the table, the up-call is made
10637 * to filecoordinationd via a MiG-generated stub. The up-call returns
10638 * immediately and filecoordinationd processes the request asynchronously.
10639 *
 * => The caller now waits for the request to complete.  This is achieved by
10641 * sleeping on the address of the request structure and waiting for
10642 * filecoordinationd to mark the request structure as complete. This
10643 * is an interruptible sleep call; if interrupted, the request structure
10644 * is removed from the table and EINTR is returned to the caller. If
10645 * this occurs, an advisory up-call is made to filecoordinationd with
10646 * the request ID to indicate that the request can be aborted or
10647 * de-prioritized at the discretion of filecoordinationd.
10648 *
10649 * => When filecoordinationd has completed the request, it signals completion
10650 * by writing to the vfs.nspace.complete sysctl node. Only a process
10651 * decorated as a namespace resolver can write to this sysctl node. The
10652 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10653 * The request ID is looked up in the table, and if the request is found,
10654 * the error code is stored in the request structure and a wakeup()
10655 * issued on the address of the request structure. If the request is not
10656 * found, we simply drop the completion notification, assuming that the
10657 * caller was interrupted.
10658 *
10659 * => When the waiting thread wakes up, it extracts the error code from the
10660 * request structure, removes the request from the table, and returns the
10661 * error code to the calling function. Fini!
10662 */
10663
/* In-kernel state for one outstanding up-call to the namespace resolver. */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request-table linkage */
	vnode_t		r_vp;               /* vnode being materialized */
	uint32_t	r_req_id;           /* ID echoed back by filecoordinationd */
	int		r_resolver_error;   /* errno delivered by the resolver */
	int		r_flags;            /* RRF_* flags */
};

/* r_flags: resolver has delivered its result for this request */
#define RRF_COMPLETE    0x0001
10673
10674 static uint32_t
next_nspace_req_id(void)10675 next_nspace_req_id(void)
10676 {
10677 static uint32_t next_req_id;
10678
10679 return OSAddAtomic(1, &next_req_id);
10680 }
10681
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (bounded by MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True while some thread is waiting in nspace_resolver_req_add() for a slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the hash table, the count, and per-request completion state. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Acquire/release the request-table mutex. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Trivial hash: the low bits of the request ID select the bucket. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
10702
10703 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)10704 nspace_resolver_req_lookup(uint32_t req_id)
10705 {
10706 struct nspace_resolver_requesthead *bucket;
10707 struct nspace_resolver_request *req;
10708
10709 bucket = NSPACE_RESOLVER_HASH(req_id);
10710 LIST_FOREACH(req, bucket, r_hashlink) {
10711 if (req->r_req_id == req_id) {
10712 return req;
10713 }
10714 }
10715
10716 return NULL;
10717 }
10718
/*
 * Insert a request into the outstanding-request table, applying
 * backpressure: if the table already holds MAX_OUTSTANDING requests,
 * sleep (interruptibly) until a slot frees up.  Called and returns with
 * NSPACE_REQ_LOCK held (msleep drops and re-takes the mutex, hence the
 * re-check loop).  Returns 0 on success or the msleep errno if the wait
 * was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* tell the remove path someone is waiting before sleeping */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			/* interrupted (PCATCH); give up without inserting */
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
10745
/*
 * Remove a request from the outstanding-request table and, if any thread
 * is blocked in nspace_resolver_req_add() waiting for a slot, wake it.
 * Caller holds NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* wake waiters in req_add() now that a slot has opened up */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}
}
10763
/*
 * Send an advisory "cancel request" message to filecoordinationd so it
 * can abandon or de-prioritize a request whose originator gave up.
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	/* drop the send right obtained from host_get_filecoordinationd_port() */
	ipc_port_release_send(mp);
}
10786
/*
 * Wait (interruptibly) for the resolver to complete a request.  If the
 * sleep is interrupted, the request is failed locally (EINTR, or
 * ETIMEDOUT for any other non-restart error) and an advisory cancel
 * message is sent to filecoordinationd.  Returns the request's resolver
 * errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/*
			 * Interrupted: synthesize our own resolver error and
			 * remember to tell filecoordinationd it can abandon
			 * the request.
			 */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	/*
	 * Once removed from the table, the completion path can no longer
	 * find this request, so r_resolver_error is stable after unlock.
	 */
	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* advisory only; any failure inside is ignored */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10816
/*
 * Record the resolver's result on a request and wake the thread sleeping
 * on it in nspace_resolver_req_wait().  Caller holds NSPACE_REQ_LOCK
 * (see nspace_resolver_req_completed()).
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10826
/*
 * Completion handler invoked when filecoordinationd reports the result
 * for request req_id (via the vfs.nspace.complete sysctl).  Looks the
 * request up in the table and marks it complete; if the completion is a
 * success and the resolver supplied the original recursive gencount of
 * the directory, re-fetch the gencount and fail the request with EBUSY
 * if the directory changed while it was being materialized.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/*
		 * Hold the rename lock on the request's mount so the
		 * gencount comparison below can't race a concurrent rename.
		 */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				/* can't fetch the gencount; 0 disables the comparison below */
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
10894
10895 static struct proc *nspace_resolver_proc;
10896
10897 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)10898 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10899 {
10900 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10901 p == nspace_resolver_proc) ? 1 : 0;
10902 return 0;
10903 }
10904
/*
 * Register (is_resolver != 0) or unregister (is_resolver == 0) process p
 * as the namespace resolver.  Only a superuser process holding the
 * PRIV_VFS_DATALESS_RESOLVER privilege may do either.  Registration
 * fails with EBUSY if some other process is already the resolver;
 * unregistration is delegated to nspace_resolver_exited().
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	error = priv_check_cred(vfs_context_ucred(ctx),
	    PRIV_VFS_DATALESS_RESOLVER, 0);
	if (error) {
		return error;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* decorate the process, then publish it as the resolver */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
10949
10950 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)10951 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10952 {
10953 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10954 (p->p_vfs_iopolicy &
10955 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10956 *is_prevented = 1;
10957 } else {
10958 *is_prevented = 0;
10959 }
10960 return 0;
10961 }
10962
10963 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)10964 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10965 {
10966 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10967 return is_prevented ? 0 : EBUSY;
10968 }
10969
10970 if (is_prevented) {
10971 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10972 } else {
10973 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10974 }
10975 return 0;
10976 }
10977
10978 static int
nspace_materialization_get_thread_state(int * is_prevented)10979 nspace_materialization_get_thread_state(int *is_prevented)
10980 {
10981 uthread_t ut = current_uthread();
10982
10983 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10984 return 0;
10985 }
10986
10987 static int
nspace_materialization_set_thread_state(int is_prevented)10988 nspace_materialization_set_thread_state(int is_prevented)
10989 {
10990 uthread_t ut = current_uthread();
10991
10992 if (is_prevented) {
10993 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10994 } else {
10995 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10996 }
10997 return 0;
10998 }
10999
/*
 * The vfs.nspace sysctl branch: parent node for the dataless-file
 * resolver controls registered below.
 */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11002
11003 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11004 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11005 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11006 {
11007 struct proc *p = req->p;
11008 int new_value, old_value, changed = 0;
11009 int error;
11010
11011 error = nspace_resolver_get_proc_state(p, &old_value);
11012 if (error) {
11013 return error;
11014 }
11015
11016 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11017 &changed);
11018 if (error == 0 && changed) {
11019 error = nspace_resolver_set_proc_state(p, new_value);
11020 }
11021 return error;
11022 }
11023
11024 /* decorate this process as the dataless file resolver */
11025 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11026 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11027 0, 0, sysctl_nspace_resolver, "I", "");
11028
11029 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11030 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11031 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11032 {
11033 struct proc *p = req->p;
11034 int new_value, old_value, changed = 0;
11035 int error;
11036
11037 error = nspace_materialization_get_proc_state(p, &old_value);
11038 if (error) {
11039 return error;
11040 }
11041
11042 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11043 &changed);
11044 if (error == 0 && changed) {
11045 error = nspace_materialization_set_proc_state(p, new_value);
11046 }
11047 return error;
11048 }
11049
11050 /* decorate this process as not wanting to materialize dataless files */
11051 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11052 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11053 0, 0, sysctl_nspace_prevent_materialization, "I", "");
11054
11055 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11056 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11057 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11058 {
11059 int new_value, old_value, changed = 0;
11060 int error;
11061
11062 error = nspace_materialization_get_thread_state(&old_value);
11063 if (error) {
11064 return error;
11065 }
11066
11067 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11068 &changed);
11069 if (error == 0 && changed) {
11070 error = nspace_materialization_set_thread_state(new_value);
11071 }
11072 return error;
11073 }
11074
11075 /* decorate this thread as not wanting to materialize dataless files */
11076 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11077 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11078 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11079
/*
 * sysctl vfs.nspace.complete
 *
 * The registered dataless-file resolver reports completion of a
 * materialization request through this sysctl.  The writer supplies an
 * opaque { req_id, errno } pair, optionally followed by a 64-bit
 * generation count.  Only the registered resolver may use this
 * interface (EPERM otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Only the resolver process itself may complete requests. */
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11129
11130 #endif /* CONFIG_DATALESS_FILES */
11131
11132 #if CONFIG_DATALESS_FILES
11133 #define __no_dataless_unused /* nothing */
11134 #else
11135 #define __no_dataless_unused __unused
11136 #endif
11137
/*
 * vfs_context_dataless_materialization_is_prevented:
 *
 * Decide whether the given vfs context may materialize dataless
 * objects.  Returns 0 when materialization may proceed, EDEADLK when
 * it is prevented, or EJUSTRETURN when the caller is an entitled
 * dataless manipulator (the operation proceeds as if the object were
 * not dataless).  Checks are ordered by precedence: kernel context,
 * manipulation entitlement, per-thread decoration, then per-process
 * iopolicy; the default is "prevented".
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11194
/*
 * nspace_resolver_init:
 *
 * One-time initialization of the dataless-file resolver subsystem:
 * allocates the hash table used to track in-flight resolver requests.
 * A no-op on kernels built without CONFIG_DATALESS_FILES.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11204
/*
 * nspace_resolver_exited:
 *
 * Called when process "p" exits (and from explicit unregistration).
 * If "p" is the registered resolver, every pending materialization
 * request is completed with ETIMEDOUT -- waiters observe the same
 * error as an unreachable backing store -- and the global resolver
 * pointer is cleared.  Safe to call for any process.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail every in-flight request so waiters do not hang. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11230
/*
 * resolve_nspace_item: legacy entry point; equivalent to calling
 * resolve_nspace_item_ext() with a NULL extension argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11236
11237 #define DATALESS_RESOLVER_ENTITLEMENT \
11238 "com.apple.private.vfs.dataless-resolver"
11239 #define DATALESS_MANIPULATION_ENTITLEMENT \
11240 "com.apple.private.vfs.dataless-manipulation"
11241
11242 /*
11243 * Return TRUE if the vfs context is associated with a process entitled
11244 * for dataless manipulation.
11245 *
11246 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11247 * complication around CONFIG_DATALESS_FILES.
11248 */
11249 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)11250 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
11251 {
11252 #if CONFIG_DATALESS_FILES
11253 assert(ctx->vc_thread == current_thread());
11254 return IOCurrentTaskHasEntitlement( DATALESS_MANIPULATION_ENTITLEMENT) ||
11255 IOCurrentTaskHasEntitlement(DATALESS_RESOLVER_ENTITLEMENT);
11256 #else
11257 return false;
11258 #endif /* CONFIG_DATALESS_FILES */
11259 }
11260
11261 #if CONFIG_DATALESS_FILES
11262 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11263 log_materialization_prevented(vnode_t vp, uint64_t op)
11264 {
11265 char p_name[MAXCOMLEN + 1];
11266 char *vntype;
11267 proc_selfname(&p_name[0], sizeof(p_name));
11268
11269 if (vp->v_type == VREG) {
11270 vntype = "File";
11271 } else if (vp->v_type == VDIR) {
11272 vntype = "Dir";
11273 } else if (vp->v_type == VLNK) {
11274 vntype = "SymLink";
11275 } else {
11276 vntype = "Other";
11277 }
11278
11279 #if DEVELOPMENT
11280 char *path = NULL;
11281 int len;
11282
11283 path = get_pathbuff();
11284 len = MAXPATHLEN;
11285 if (path) {
11286 vn_getpath(vp, path, &len);
11287 }
11288
11289 os_log_debug(OS_LOG_DEFAULT,
11290 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11291 p_name, proc_selfpid(),
11292 op, vntype, path ? path : "<unknown-path>");
11293 if (path) {
11294 release_pathbuff(path);
11295 }
11296 #else
11297 os_log_debug(OS_LOG_DEFAULT,
11298 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11299 p_name, proc_selfpid(),
11300 op, vntype);
11301 #endif
11302 }
11303 #endif /* CONFIG_DATALESS_FILES */
11304
11305
11306 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11307 vfs_materialize_item(
11308 struct vnode *vp __no_dataless_unused,
11309 uint64_t op __no_dataless_unused,
11310 int64_t offset __no_dataless_unused,
11311 int64_t size __no_dataless_unused,
11312 char *lookup_name __no_dataless_unused,
11313 size_t const namelen __no_dataless_unused)
11314 {
11315 #if CONFIG_DATALESS_FILES
11316 struct nspace_resolver_request req;
11317 kern_return_t kern_ret;
11318 mach_port_t mach_port;
11319 char *path = NULL;
11320 vfs_context_t context;
11321 int path_len;
11322 int error;
11323 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11324 audit_token_t atoken;
11325 #endif
11326
11327 /*
11328 * If this is a snapshot event and the vnode is on a disk image just
11329 * pretend nothing happened since any change to the disk image will
11330 * cause the disk image itself to get backed up and this avoids multi-
11331 * way deadlocks between the snapshot handler and the ever popular
11332 * diskimages-helper process. The variable nspace_allow_virtual_devs
11333 * allows this behavior to be overridden (for use by the Mobile
11334 * TimeMachine testing infrastructure which uses disk images).
11335 */
11336 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11337 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11338 return ENOTSUP;
11339 }
11340
11341 context = vfs_context_current();
11342
11343 error = vfs_context_dataless_materialization_is_prevented(context);
11344 if (error) {
11345 log_materialization_prevented(vp, op);
11346 return error;
11347 }
11348
11349 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11350 &mach_port);
11351 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11352 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11353 /*
11354 * Treat this like being unable to access the backing store
11355 * server.
11356 */
11357 return ETIMEDOUT;
11358 }
11359
11360 path = zalloc(ZV_NAMEI);
11361 path_len = MAXPATHLEN;
11362
11363 error = vn_getpath(vp, path, &path_len);
11364 if (error) {
11365 goto out_release_port;
11366 }
11367
11368 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11369 error = vfs_context_copy_audit_token(context, &atoken);
11370 if (error) {
11371 goto out_release_port;
11372 }
11373 #endif
11374
11375 req.r_req_id = next_nspace_req_id();
11376 req.r_resolver_error = 0;
11377 req.r_flags = 0;
11378 req.r_vp = vp;
11379
11380 NSPACE_REQ_LOCK();
11381 error = nspace_resolver_req_add(&req);
11382 NSPACE_REQ_UNLOCK();
11383 if (error) {
11384 goto out_release_port;
11385 }
11386
11387 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
11388 if (vp->v_type == VDIR) {
11389 char *tmpname = NULL;
11390
11391 /*
11392 * If the caller provided a lookup_name *and* a name length,
11393 * then we assume the lookup_name is not NUL-terminated.
11394 * Allocate a temporary buffer in this case to provide
11395 * a NUL-terminated path name to the IPC call.
11396 */
11397 if (lookup_name != NULL && namelen != 0) {
11398 if (namelen >= PATH_MAX) {
11399 error = EINVAL;
11400 goto out_release_port;
11401 }
11402 tmpname = zalloc(ZV_NAMEI);
11403 strlcpy(tmpname, lookup_name, namelen + 1);
11404 lookup_name = tmpname;
11405 } else if (lookup_name != NULL) {
11406 /*
11407 * If the caller provided a lookup_name with a
11408 * zero name length, then we assume it's NUL-
11409 * terminated. Verify it has a valid length.
11410 */
11411 if (strlen(lookup_name) >= PATH_MAX) {
11412 error = EINVAL;
11413 goto out_release_port;
11414 }
11415 }
11416
11417 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11418 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
11419 req.r_req_id, (uint32_t)(op & 0xffffffff),
11420 lookup_name == NULL ? "" : lookup_name, path, atoken);
11421 #else
11422 kern_ret = send_vfs_resolve_dir(mach_port, req.r_req_id,
11423 proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11424 lookup_name == NULL ? "" : lookup_name, path);
11425 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11426
11427 if (tmpname != NULL) {
11428 zfree(ZV_NAMEI, tmpname);
11429
11430 /*
11431 * Poison lookup_name rather than reference
11432 * freed memory.
11433 */
11434 lookup_name = NULL;
11435 }
11436 } else {
11437 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11438 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
11439 req.r_req_id, (uint32_t)(op & 0xffffffff),
11440 offset, size, path, atoken);
11441 #else
11442 kern_ret = send_vfs_resolve_file(mach_port, req.r_req_id,
11443 proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11444 offset, size, path);
11445 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11446 }
11447 if (kern_ret != KERN_SUCCESS) {
11448 /*
11449 * Also treat this like being unable to access the backing
11450 * store server.
11451 */
11452 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
11453 kern_ret);
11454 error = ETIMEDOUT;
11455
11456 NSPACE_REQ_LOCK();
11457 nspace_resolver_req_remove(&req);
11458 NSPACE_REQ_UNLOCK();
11459 goto out_release_port;
11460 }
11461
11462 /*
11463 * Give back the memory we allocated earlier while we wait; we
11464 * no longer need it.
11465 */
11466 zfree(ZV_NAMEI, path);
11467 path = NULL;
11468
11469 /*
11470 * Request has been submitted to the resolver. Now (interruptibly)
11471 * wait for completion. Upon requrn, the request will have been
11472 * removed from the lookup table.
11473 */
11474 error = nspace_resolver_req_wait(&req);
11475
11476 out_release_port:
11477 if (path != NULL) {
11478 zfree(ZV_NAMEI, path);
11479 }
11480 ipc_port_release_send(mach_port);
11481
11482 return error;
11483 #else
11484 return ENOTSUP;
11485 #endif /* CONFIG_DATALESS_FILES */
11486 }
11487
11488 /*
11489 * vfs_materialize_file: Materialize a regular file.
11490 *
11491 * Inputs:
11492 * vp The dataless file to be materialized.
11493 *
11494 * op What kind of operation is being performed:
11495 * -> NAMESPACE_HANDLER_READ_OP
11496 * -> NAMESPACE_HANDLER_WRITE_OP
11497 * -> NAMESPACE_HANDLER_LINK_CREATE
11498 * -> NAMESPACE_HANDLER_DELETE_OP
11499 * -> NAMESPACE_HANDLER_TRUNCATE_OP
11500 * -> NAMESPACE_HANDLER_RENAME_OP
11501 *
11502 * offset offset of I/O for READ or WRITE. Ignored for
11503 * other ops.
11504 *
11505 * size size of I/O for READ or WRITE Ignored for
11506 * other ops.
11507 *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
11509 * consider the range to be unknown.
11510 *
11511 * Upon successful return, the caller may proceed with the operation.
11512 * N.B. the file may still be "dataless" in this case.
11513 */
11514 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)11515 vfs_materialize_file(
11516 struct vnode *vp,
11517 uint64_t op,
11518 int64_t offset,
11519 int64_t size)
11520 {
11521 if (vp->v_type != VREG) {
11522 return EFTYPE;
11523 }
11524 return vfs_materialize_item(vp, op, offset, size, NULL, 0);
11525 }
11526
11527 /*
11528 * vfs_materialize_dir:
11529 *
11530 * Inputs:
11531 * vp The dataless directory to be materialized.
11532 *
11533 * op What kind of operation is being performed:
11534 * -> NAMESPACE_HANDLER_READ_OP
11535 * -> NAMESPACE_HANDLER_WRITE_OP
11536 * -> NAMESPACE_HANDLER_DELETE_OP
11537 * -> NAMESPACE_HANDLER_RENAME_OP
11538 * -> NAMESPACE_HANDLER_LOOKUP_OP
11539 *
11540 * lookup_name Name being looked up for a LOOKUP op. Ignored for
11541 * other ops. May or may not be NUL-terminated; see below.
11542 *
11543 * namelen If non-zero, then lookup_name is assumed to not be NUL-
11544 * terminated and namelen is the number of valid bytes in
11545 * lookup_name. If zero, then lookup_name is assumed to be
11546 * NUL-terminated.
11547 *
11548 * Upon successful return, the caller may proceed with the operation.
11549 * N.B. the directory may still be "dataless" in this case.
11550 */
11551 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)11552 vfs_materialize_dir(
11553 struct vnode *vp,
11554 uint64_t op,
11555 char *lookup_name,
11556 size_t namelen)
11557 {
11558 if (vp->v_type != VDIR) {
11559 return EFTYPE;
11560 }
11561 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
11562 return EINVAL;
11563 }
11564 return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
11565 }
11566
/*
 * resolve_nspace_item_ext:
 *
 * Legacy materialization path: ask the user-space resolver
 * (filecoordinationd) to materialize the dataless vnode "vp" for
 * operation "op" and wait (interruptibly) for the answer.  "arg" is
 * currently unused.
 *
 * Returns 0 on success, EFTYPE for unsupported vnode types, ENOTSUP
 * for snapshot events, ETIMEDOUT when the resolver cannot be reached,
 * or the resolver-reported errno.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11684
/*
 * nspace_snapshot_event: legacy snapshot-event hook; now a no-op that
 * always reports success.  All parameters are ignored.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
11691
11692 #if 0
11693 static int
11694 build_volfs_path(struct vnode *vp, char *path, int *len)
11695 {
11696 struct vnode_attr va;
11697 int ret;
11698
11699 VATTR_INIT(&va);
11700 VATTR_WANTED(&va, va_fsid);
11701 VATTR_WANTED(&va, va_fileid);
11702
11703 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
11704 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
11705 ret = -1;
11706 } else {
11707 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
11708 ret = 0;
11709 }
11710
11711 return ret;
11712 }
11713 #endif
11714
11715 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)11716 fsctl_bogus_command_compat(unsigned long cmd)
11717 {
11718 switch (cmd) {
11719 case IOCBASECMD(FSIOC_SYNC_VOLUME):
11720 return FSIOC_SYNC_VOLUME;
11721 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
11722 return FSIOC_ROUTEFS_SETROUTEID;
11723 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
11724 return FSIOC_SET_PACKAGE_EXTS;
11725 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
11726 return FSIOC_SET_FSTYPENAME_OVERRIDE;
11727 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
11728 return DISK_CONDITIONER_IOC_GET;
11729 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
11730 return DISK_CONDITIONER_IOC_SET;
11731 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
11732 return FSIOC_FIOSEEKHOLE;
11733 case IOCBASECMD(FSIOC_FIOSEEKDATA):
11734 return FSIOC_FIOSEEKDATA;
11735 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
11736 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
11737 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11738 return SPOTLIGHT_IOC_GET_LAST_MTIME;
11739 }
11740
11741 return cmd;
11742 }
11743
/*
 * cas_bsdflags_setattr: setattr callback handed to chflags0() that
 * performs a compare-and-swap of the BSD flags via the filesystem's
 * FSIOC_CAS_BSDFLAGS ioctl.  "arg" is the struct fsioc_cas_bsdflags
 * carrying the expected and new flag values.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11749
/*
 * handle_sync_volume: implement FSIOC_SYNC_VOLUME.  Syncs the mount
 * containing "vp" (waiting if FSCTL_SYNC_WAIT is set in *data), then
 * optionally issues an F_FULLFSYNC ioctl on the vnode.  On return
 * *arg_vp is set to NULL because the caller's iocount on vp has been
 * dropped here.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* flag word "arg" against the
	 * user-facing FSCTL_SYNC_FULLSYNC bit, rather than testing the
	 * user-supplied flags in *(uint32_t *)data.  It looks like it was
	 * meant to test the user flags -- confirm intended behavior before
	 * changing, since user space may depend on the current quirk.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
11812
11813 #if ROUTEFS
11814 static int __attribute__((noinline))
handle_routes(user_addr_t udata)11815 handle_routes(user_addr_t udata)
11816 {
11817 char routepath[MAXPATHLEN];
11818 size_t len = 0;
11819 int error;
11820
11821 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11822 return error;
11823 }
11824 bzero(routepath, MAXPATHLEN);
11825 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11826 if (error) {
11827 return error;
11828 }
11829 error = routefs_kernel_mount(routepath);
11830 return error;
11831 }
11832 #endif
11833
11834 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)11835 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
11836 {
11837 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11838 struct vnode_attr va;
11839 int error;
11840
11841 VATTR_INIT(&va);
11842 VATTR_SET(&va, va_flags, cas->new_flags);
11843
11844 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11845 return error;
11846 }
11847
11848 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)11849 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
11850 {
11851 struct mount *mp = NULL;
11852 errno_t rootauth = 0;
11853
11854 mp = vp->v_mount;
11855
11856 /*
11857 * query the underlying FS and see if it reports something
11858 * sane for this vnode. If volume is authenticated via
11859 * chunklist, leave that for the caller to determine.
11860 */
11861 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
11862
11863 return rootauth;
11864 }
11865
11866 /*
11867 * Make a filesystem-specific control call:
11868 */
11869 /* ARGSUSED */
11870 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)11871 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
11872 {
11873 int error = 0;
11874 boolean_t is64bit;
11875 u_int size;
11876 #define STK_PARAMS 128
11877 char stkbuf[STK_PARAMS] = {0};
11878 caddr_t data, memp;
11879 vnode_t vp = *arg_vp;
11880
11881 if (vp->v_type == VCHR || vp->v_type == VBLK) {
11882 return ENOTTY;
11883 }
11884
11885 cmd = fsctl_bogus_command_compat(cmd);
11886
11887 size = IOCPARM_LEN(cmd);
11888 if (size > IOCPARM_MAX) {
11889 return EINVAL;
11890 }
11891
11892 is64bit = proc_is64bit(p);
11893
11894 memp = NULL;
11895
11896 if (size > sizeof(stkbuf)) {
11897 if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
11898 return ENOMEM;
11899 }
11900 data = memp;
11901 } else {
11902 data = &stkbuf[0];
11903 };
11904
11905 if (cmd & IOC_IN) {
11906 if (size) {
11907 error = copyin(udata, data, size);
11908 if (error) {
11909 if (memp) {
11910 kfree_data(memp, size);
11911 }
11912 return error;
11913 }
11914 } else {
11915 if (is64bit) {
11916 *(user_addr_t *)data = udata;
11917 } else {
11918 *(uint32_t *)data = (uint32_t)udata;
11919 }
11920 };
11921 } else if ((cmd & IOC_OUT) && size) {
11922 /*
11923 * Zero the buffer so the user always
11924 * gets back something deterministic.
11925 */
11926 bzero(data, size);
11927 } else if (cmd & IOC_VOID) {
11928 if (is64bit) {
11929 *(user_addr_t *)data = udata;
11930 } else {
11931 *(uint32_t *)data = (uint32_t)udata;
11932 }
11933 }
11934
11935 /* Check to see if it's a generic command */
11936 switch (cmd) {
11937 case FSIOC_SYNC_VOLUME:
11938 error = handle_sync_volume(vp, arg_vp, data, ctx);
11939 break;
11940
11941 case FSIOC_ROUTEFS_SETROUTEID:
11942 #if ROUTEFS
11943 error = handle_routes(udata);
11944 #endif
11945 break;
11946
11947 case FSIOC_SET_PACKAGE_EXTS: {
11948 user_addr_t ext_strings;
11949 uint32_t num_entries;
11950 uint32_t max_width;
11951
11952 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
11953 break;
11954 }
11955
11956 if ((is64bit && size != sizeof(user64_package_ext_info))
11957 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
11958 // either you're 64-bit and passed a 64-bit struct or
11959 // you're 32-bit and passed a 32-bit struct. otherwise
11960 // it's not ok.
11961 error = EINVAL;
11962 break;
11963 }
11964
11965 if (is64bit) {
11966 if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
11967 assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
11968 }
11969 ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
11970 num_entries = ((user64_package_ext_info *)data)->num_entries;
11971 max_width = ((user64_package_ext_info *)data)->max_width;
11972 } else {
11973 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
11974 num_entries = ((user32_package_ext_info *)data)->num_entries;
11975 max_width = ((user32_package_ext_info *)data)->max_width;
11976 }
11977 error = set_package_extensions_table(ext_strings, num_entries, max_width);
11978 }
11979 break;
11980
11981 case FSIOC_SET_FSTYPENAME_OVERRIDE:
11982 {
11983 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11984 break;
11985 }
11986 if (vp->v_mount) {
11987 mount_lock(vp->v_mount);
11988 if (data[0] != 0) {
11989 int i;
11990 for (i = 0; i < MFSTYPENAMELEN; i++) {
11991 if (!data[i]) {
11992 goto continue_copy;
11993 }
11994 }
11995 /*
11996 * Getting here means we have a user data string which has no
11997 * NULL termination in its first MFSTYPENAMELEN bytes.
11998 * This is bogus, let's avoid strlcpy-ing the read data and
11999 * return an error.
12000 */
12001 error = EINVAL;
12002 goto unlock;
12003 continue_copy:
12004 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
12005 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
12006 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
12007 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
12008 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
12009 }
12010 } else {
12011 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
12012 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
12013 }
12014 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
12015 vp->v_mount->fstypename_override[0] = '\0';
12016 }
12017 unlock:
12018 mount_unlock(vp->v_mount);
12019 }
12020 }
12021 break;
12022
12023 case DISK_CONDITIONER_IOC_GET: {
12024 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12025 }
12026 break;
12027
12028 case DISK_CONDITIONER_IOC_SET: {
12029 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
12030 }
12031 break;
12032
12033 case FSIOC_CAS_BSDFLAGS:
12034 error = handle_flags(vp, data, ctx);
12035 break;
12036
12037 case FSIOC_FD_ONLY_OPEN_ONCE: {
12038 error = 0;
12039 if (vnode_usecount(vp) > 1) {
12040 vnode_lock_spin(vp);
12041 if (vp->v_lflag & VL_HASSTREAMS) {
12042 if (vnode_isinuse_locked(vp, 1, 1)) {
12043 error = EBUSY;
12044 }
12045 } else if (vnode_usecount(vp) > 1) {
12046 error = EBUSY;
12047 }
12048 vnode_unlock(vp);
12049 }
12050 }
12051 break;
12052
12053 case FSIOC_EVAL_ROOTAUTH:
12054 error = handle_auth(vp, cmd, data, options, ctx);
12055 break;
12056
12057 default: {
12058 /* other, known commands shouldn't be passed down here */
12059 switch (cmd) {
12060 case F_PUNCHHOLE:
12061 case F_TRIM_ACTIVE_FILE:
12062 case F_RDADVISE:
12063 case F_TRANSCODEKEY:
12064 case F_GETPROTECTIONLEVEL:
12065 case F_GETDEFAULTPROTLEVEL:
12066 case F_MAKECOMPRESSED:
12067 case F_SET_GREEDY_MODE:
12068 case F_SETSTATICCONTENT:
12069 case F_SETIOTYPE:
12070 case F_SETBACKINGSTORE:
12071 case F_GETPATH_MTMINFO:
12072 case APFSIOC_REVERT_TO_SNAPSHOT:
12073 case FSIOC_FIOSEEKHOLE:
12074 case FSIOC_FIOSEEKDATA:
12075 case HFS_GET_BOOT_INFO:
12076 case HFS_SET_BOOT_INFO:
12077 case FIOPINSWAP:
12078 case F_CHKCLEAN:
12079 case F_FULLFSYNC:
12080 case F_BARRIERFSYNC:
12081 case F_FREEZE_FS:
12082 case F_THAW_FS:
12083 case FSIOC_KERNEL_ROOTAUTH:
12084 error = EINVAL;
12085 goto outdrop;
12086 }
12087 /* Invoke the filesystem-specific code */
12088 error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12089 }
12090 } /* end switch stmt */
12091
12092 /*
12093 * if no errors, copy any data to user. Size was
12094 * already set and checked above.
12095 */
12096 if (error == 0 && (cmd & IOC_OUT) && size) {
12097 error = copyout(data, udata, size);
12098 }
12099
12100 outdrop:
12101 if (memp) {
12102 kfree_data(memp, size);
12103 }
12104
12105 return error;
12106 }
12107
/* ARGSUSED */
/*
 * fsctl(2): path-based file system control.
 *
 * Looks up uap->path, runs the MACF fsctl check on the containing mount,
 * and hands the command off to fsctl_internal().  Returns 0 on success or
 * an errno value.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* operate on the firmlink itself, bypassing the name cache */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	/* namei() returned vp with an iocount; released via vnode_put() below */
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may drop the iocount and NULL out vp; re-check */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/* ARGSUSED */
/*
 * ffsctl(2): fd-based file system control.
 *
 * Resolves uap->fd to a vnode (file_vnode() takes an fd reference,
 * vnode_getwithref() an iocount), runs the MACF fsctl check, then calls
 * fsctl_internal().  Both references are dropped on every exit path.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
12202 /* end of fsctl system call */
12203
12204 #define FILESEC_ACCESS_ENTITLEMENT \
12205 "com.apple.private.vfs.filesec-access"
12206
12207 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12208 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12209 {
12210 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12211 /*
12212 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12213 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12214 */
12215 if ((!setting && vfs_context_issuser(ctx)) ||
12216 IOCurrentTaskHasEntitlement(FILESEC_ACCESS_ENTITLEMENT)) {
12217 return 0;
12218 }
12219 }
12220
12221 return EPERM;
12222 }
12223
/*
 * Retrieve the data of an extended attribute (getxattr(2), path-based).
 *
 * On success, *retval is the number of bytes copied into uap->value when a
 * buffer was supplied, or the attribute's total size when the call was a
 * size probe (no buffer, or the legacy size == -1 convention below).
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	/* vp carries an iocount from namei(); dropped at 'out' below */
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected names (e.g. the filesec xattr) require an entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binaray compatibilty in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implemtation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying fileystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio; /* size-probe mode: no uio, report attrsize only */
	}

	if (uap->value) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			/* clamp to keep the FS from wiring an absurd amount */
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		/* bytes actually transferred */
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12309
12310 /*
12311 * Retrieve the data of an extended attribute.
12312 */
12313 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12314 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12315 {
12316 vnode_t vp;
12317 char attrname[XATTR_MAXNAMELEN + 1];
12318 vfs_context_t ctx = vfs_context_current();
12319 uio_t auio = NULL;
12320 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12321 size_t attrsize = 0;
12322 size_t namelen;
12323 int error;
12324 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12325
12326 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12327 return EINVAL;
12328 }
12329
12330 if ((error = file_vnode(uap->fd, &vp))) {
12331 return error;
12332 }
12333 if ((error = vnode_getwithref(vp))) {
12334 file_drop(uap->fd);
12335 return error;
12336 }
12337 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12338 if (error != 0) {
12339 goto out;
12340 }
12341 if (xattr_protected(attrname) &&
12342 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12343 goto out;
12344 }
12345 if (uap->value && uap->size > 0) {
12346 if (uap->size > (size_t)XATTR_MAXSIZE) {
12347 uap->size = XATTR_MAXSIZE;
12348 }
12349
12350 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12351 &uio_buf[0], sizeof(uio_buf));
12352 uio_addiov(auio, uap->value, uap->size);
12353 }
12354
12355 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
12356 out:
12357 (void)vnode_put(vp);
12358 file_drop(uap->fd);
12359
12360 if (auio) {
12361 *retval = uap->size - uio_resid(auio);
12362 } else {
12363 *retval = (user_ssize_t)attrsize;
12364 }
12365 return error;
12366 }
12367
/*
 * Heap-allocated working state for setxattr().  The nameidata, attribute
 * name, and uio backing buffer are kept off the kernel stack because
 * together they are large.  (Note: earlier comment claiming this was for
 * "checkdirs iteration" was a copy/paste error.)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
12374
/*
 * Set the data of an extended attribute (setxattr(2), path-based).
 *
 * Working state lives in a heap-allocated setxattr_ctx to keep the large
 * nameidata/name/uio buffers off the kernel stack.  On success a
 * FSE_XATTR_MODIFIED fsevent is posted.  *retval is always 0; status is
 * the function's return value.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = (struct setxattr_ctx *)kalloc_data(sizeof(struct setxattr_ctx), Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected names require an entitlement for writes. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size with no source buffer is nonsensical. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	/* vp carries an iocount from namei(); dropped before 'out' */
	vp = sactx->nd.ni_vp;
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_data(sactx, sizeof(struct setxattr_ctx));
	*retval = 0;
	return error;
}
12447
12448 /*
12449 * Set the data of an extended attribute.
12450 */
12451 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)12452 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
12453 {
12454 vnode_t vp;
12455 char attrname[XATTR_MAXNAMELEN + 1];
12456 vfs_context_t ctx = vfs_context_current();
12457 uio_t auio = NULL;
12458 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12459 size_t namelen;
12460 int error;
12461 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12462
12463 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12464 return EINVAL;
12465 }
12466
12467 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12468 if (error != 0) {
12469 if (error == EPERM) {
12470 /* if the string won't fit in attrname, copyinstr emits EPERM */
12471 return ENAMETOOLONG;
12472 }
12473 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12474 return error;
12475 }
12476 if (xattr_protected(attrname) &&
12477 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
12478 return error;
12479 }
12480 if (uap->size != 0 && uap->value == 0) {
12481 return EINVAL;
12482 }
12483 if (uap->size > INT_MAX) {
12484 return E2BIG;
12485 }
12486 if ((error = file_vnode(uap->fd, &vp))) {
12487 return error;
12488 }
12489 if ((error = vnode_getwithref(vp))) {
12490 file_drop(uap->fd);
12491 return error;
12492 }
12493 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12494 &uio_buf[0], sizeof(uio_buf));
12495 uio_addiov(auio, uap->value, uap->size);
12496
12497 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
12498 #if CONFIG_FSE
12499 if (error == 0) {
12500 add_fsevent(FSE_XATTR_MODIFIED, ctx,
12501 FSE_ARG_VNODE, vp,
12502 FSE_ARG_DONE);
12503 }
12504 #endif
12505 vnode_put(vp);
12506 file_drop(uap->fd);
12507 *retval = 0;
12508 return error;
12509 }
12510
12511 /*
12512 * Remove an extended attribute.
12513 * XXX Code duplication here.
12514 */
12515 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)12516 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
12517 {
12518 vnode_t vp;
12519 struct nameidata nd;
12520 char attrname[XATTR_MAXNAMELEN + 1];
12521 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12522 vfs_context_t ctx = vfs_context_current();
12523 size_t namelen;
12524 u_int32_t nameiflags;
12525 int error;
12526
12527 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12528 return EINVAL;
12529 }
12530
12531 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12532 if (error != 0) {
12533 return error;
12534 }
12535 if (xattr_protected(attrname)) {
12536 return EPERM;
12537 }
12538 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12539 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
12540 if ((error = namei(&nd))) {
12541 return error;
12542 }
12543 vp = nd.ni_vp;
12544 nameidone(&nd);
12545
12546 error = vn_removexattr(vp, attrname, uap->options, ctx);
12547 #if CONFIG_FSE
12548 if (error == 0) {
12549 add_fsevent(FSE_XATTR_REMOVED, ctx,
12550 FSE_ARG_VNODE, vp,
12551 FSE_ARG_DONE);
12552 }
12553 #endif
12554 vnode_put(vp);
12555 *retval = 0;
12556 return error;
12557 }
12558
12559 /*
12560 * Remove an extended attribute.
12561 * XXX Code duplication here.
12562 */
12563 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)12564 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
12565 {
12566 vnode_t vp;
12567 char attrname[XATTR_MAXNAMELEN + 1];
12568 size_t namelen;
12569 int error;
12570 #if CONFIG_FSE
12571 vfs_context_t ctx = vfs_context_current();
12572 #endif
12573
12574 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12575 return EINVAL;
12576 }
12577
12578 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12579 if (error != 0) {
12580 return error;
12581 }
12582 if (xattr_protected(attrname)) {
12583 return EPERM;
12584 }
12585 if ((error = file_vnode(uap->fd, &vp))) {
12586 return error;
12587 }
12588 if ((error = vnode_getwithref(vp))) {
12589 file_drop(uap->fd);
12590 return error;
12591 }
12592
12593 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
12594 #if CONFIG_FSE
12595 if (error == 0) {
12596 add_fsevent(FSE_XATTR_REMOVED, ctx,
12597 FSE_ARG_VNODE, vp,
12598 FSE_ARG_DONE);
12599 }
12600 #endif
12601 vnode_put(vp);
12602 file_drop(uap->fd);
12603 *retval = 0;
12604 return error;
12605 }
12606
12607 /*
12608 * Retrieve the list of extended attribute names.
12609 * XXX Code duplication here.
12610 */
12611 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)12612 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
12613 {
12614 vnode_t vp;
12615 struct nameidata nd;
12616 vfs_context_t ctx = vfs_context_current();
12617 uio_t auio = NULL;
12618 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12619 size_t attrsize = 0;
12620 u_int32_t nameiflags;
12621 int error;
12622 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12623
12624 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12625 return EINVAL;
12626 }
12627
12628 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12629 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
12630 if ((error = namei(&nd))) {
12631 return error;
12632 }
12633 vp = nd.ni_vp;
12634 nameidone(&nd);
12635 if (uap->namebuf != 0 && uap->bufsize > 0) {
12636 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
12637 &uio_buf[0], sizeof(uio_buf));
12638 uio_addiov(auio, uap->namebuf, uap->bufsize);
12639 }
12640
12641 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
12642
12643 vnode_put(vp);
12644 if (auio) {
12645 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12646 } else {
12647 *retval = (user_ssize_t)attrsize;
12648 }
12649 return error;
12650 }
12651
12652 /*
12653 * Retrieve the list of extended attribute names.
12654 * XXX Code duplication here.
12655 */
12656 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)12657 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
12658 {
12659 vnode_t vp;
12660 uio_t auio = NULL;
12661 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12662 size_t attrsize = 0;
12663 int error;
12664 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12665
12666 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12667 return EINVAL;
12668 }
12669
12670 if ((error = file_vnode(uap->fd, &vp))) {
12671 return error;
12672 }
12673 if ((error = vnode_getwithref(vp))) {
12674 file_drop(uap->fd);
12675 return error;
12676 }
12677 if (uap->namebuf != 0 && uap->bufsize > 0) {
12678 auio = uio_createwithbuffer(1, 0, spacetype,
12679 UIO_READ, &uio_buf[0], sizeof(uio_buf));
12680 uio_addiov(auio, uap->namebuf, uap->bufsize);
12681 }
12682
12683 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
12684
12685 vnode_put(vp);
12686 file_drop(uap->fd);
12687 if (auio) {
12688 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12689 } else {
12690 *retval = (user_ssize_t)attrsize;
12691 }
12692 return error;
12693 }
12694
12695 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)12696 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
12697 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
12698 {
12699 int error;
12700 struct mount *mp = NULL;
12701 vnode_t vp;
12702 int length;
12703 int bpflags;
12704 /* maximum number of times to retry build_path */
12705 unsigned int retries = 0x10;
12706
12707 if (bufsize > PAGE_SIZE) {
12708 return EINVAL;
12709 }
12710
12711 if (buf == NULL) {
12712 return ENOMEM;
12713 }
12714
12715 retry:
12716 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
12717 error = ENOTSUP; /* unexpected failure */
12718 return ENOTSUP;
12719 }
12720
12721 #if CONFIG_UNION_MOUNTS
12722 unionget:
12723 #endif /* CONFIG_UNION_MOUNTS */
12724 if (objid == 2) {
12725 struct vfs_attr vfsattr;
12726 int use_vfs_root = TRUE;
12727
12728 VFSATTR_INIT(&vfsattr);
12729 VFSATTR_WANTED(&vfsattr, f_capabilities);
12730 if (!(options & FSOPT_ISREALFSID) &&
12731 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
12732 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
12733 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
12734 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
12735 use_vfs_root = FALSE;
12736 }
12737 }
12738
12739 if (use_vfs_root) {
12740 error = VFS_ROOT(mp, &vp, ctx);
12741 } else {
12742 error = VFS_VGET(mp, objid, &vp, ctx);
12743 }
12744 } else {
12745 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
12746 }
12747
12748 #if CONFIG_UNION_MOUNTS
12749 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
12750 /*
12751 * If the fileid isn't found and we're in a union
12752 * mount volume, then see if the fileid is in the
12753 * mounted-on volume.
12754 */
12755 struct mount *tmp = mp;
12756 mp = vnode_mount(tmp->mnt_vnodecovered);
12757 vfs_unbusy(tmp);
12758 if (vfs_busy(mp, LK_NOWAIT) == 0) {
12759 goto unionget;
12760 }
12761 } else {
12762 vfs_unbusy(mp);
12763 }
12764 #else
12765 vfs_unbusy(mp);
12766 #endif /* CONFIG_UNION_MOUNTS */
12767
12768 if (error) {
12769 return error;
12770 }
12771
12772 #if CONFIG_MACF
12773 error = mac_vnode_check_fsgetpath(ctx, vp);
12774 if (error) {
12775 vnode_put(vp);
12776 return error;
12777 }
12778 #endif
12779
12780 /* Obtain the absolute path to this vnode. */
12781 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
12782 if (options & FSOPT_NOFIRMLINKPATH) {
12783 bpflags |= BUILDPATH_NO_FIRMLINK;
12784 }
12785 bpflags |= BUILDPATH_CHECK_MOVED;
12786 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
12787 vnode_put(vp);
12788
12789 if (error) {
12790 /* there was a race building the path, try a few more times */
12791 if (error == EAGAIN) {
12792 --retries;
12793 if (retries > 0) {
12794 goto retry;
12795 }
12796
12797 error = ENOENT;
12798 }
12799 goto out;
12800 }
12801
12802 AUDIT_ARG(text, buf);
12803
12804 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
12805 unsigned long path_words[NUMPARMS];
12806 size_t path_len = sizeof(path_words);
12807
12808 if ((size_t)length < path_len) {
12809 memcpy((char *)path_words, buf, length);
12810 memset((char *)path_words + length, 0, path_len - length);
12811
12812 path_len = length;
12813 } else {
12814 memcpy((char *)path_words, buf + (length - path_len), path_len);
12815 }
12816
12817 kdebug_vfs_lookup(path_words, (int)path_len, vp,
12818 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
12819 }
12820
12821 *pathlen = length; /* may be superseded by error */
12822
12823 out:
12824 return error;
12825 }
12826
12827 /*
12828 * Obtain the full pathname of a file system object by id.
12829 */
12830 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)12831 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
12832 uint32_t options, user_ssize_t *retval)
12833 {
12834 vfs_context_t ctx = vfs_context_current();
12835 fsid_t fsid;
12836 char *realpath;
12837 int length;
12838 int error;
12839
12840 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
12841 return EINVAL;
12842 }
12843
12844 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
12845 return error;
12846 }
12847 AUDIT_ARG(value32, fsid.val[0]);
12848 AUDIT_ARG(value64, objid);
12849 /* Restrict output buffer size for now. */
12850
12851 if (bufsize > PAGE_SIZE || bufsize <= 0) {
12852 return EINVAL;
12853 }
12854 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
12855 if (realpath == NULL) {
12856 return ENOMEM;
12857 }
12858
12859 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
12860 options, &length);
12861
12862 if (error) {
12863 goto out;
12864 }
12865
12866 error = copyout((caddr_t)realpath, buf, length);
12867
12868 *retval = (user_ssize_t)length; /* may be superseded by error */
12869 out:
12870 kfree_data(realpath, bufsize);
12871 return error;
12872 }
12873
/*
 * fsgetpath(2): legacy entry point; identical to fsgetpath_ext(2) with
 * options forced to 0.
 */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
12880
/*
 * fsgetpath_ext(2): like fsgetpath(2) but accepts caller options
 * (FSOPT_NOFIRMLINKPATH, FSOPT_ISREALFSID); validated in the backend.
 */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
12887
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Builds a user32_statfs or user64_statfs from the mount's vfsstatfs and
 * copies it to bufp.  When partial_copy is set, the trailing reserved
 * fields are omitted from the copyout (legacy short-struct callers).
 * *sizep (if non-NULL) always receives the full native struct size.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* honor any fstypename override set via FSIOC_SET_FSTYPENAME_OVERRIDE */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* legacy callers don't receive the reserved tail fields */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				/* stop before the scaled blocksize itself overflows */
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* honor any fstypename override set via FSIOC_SET_FSTYPENAME_OVERRIDE */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* legacy callers don't receive the reserved tail fields */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13016
13017 /*
13018 * copy stat structure into user_stat structure.
13019 */
13020 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13021 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13022 {
13023 bzero(usbp, sizeof(*usbp));
13024
13025 usbp->st_dev = sbp->st_dev;
13026 usbp->st_ino = sbp->st_ino;
13027 usbp->st_mode = sbp->st_mode;
13028 usbp->st_nlink = sbp->st_nlink;
13029 usbp->st_uid = sbp->st_uid;
13030 usbp->st_gid = sbp->st_gid;
13031 usbp->st_rdev = sbp->st_rdev;
13032 #ifndef _POSIX_C_SOURCE
13033 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13034 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13035 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13036 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13037 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13038 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13039 #else
13040 usbp->st_atime = sbp->st_atime;
13041 usbp->st_atimensec = sbp->st_atimensec;
13042 usbp->st_mtime = sbp->st_mtime;
13043 usbp->st_mtimensec = sbp->st_mtimensec;
13044 usbp->st_ctime = sbp->st_ctime;
13045 usbp->st_ctimensec = sbp->st_ctimensec;
13046 #endif
13047 usbp->st_size = sbp->st_size;
13048 usbp->st_blocks = sbp->st_blocks;
13049 usbp->st_blksize = sbp->st_blksize;
13050 usbp->st_flags = sbp->st_flags;
13051 usbp->st_gen = sbp->st_gen;
13052 usbp->st_lspare = sbp->st_lspare;
13053 usbp->st_qspare[0] = sbp->st_qspare[0];
13054 usbp->st_qspare[1] = sbp->st_qspare[1];
13055 }
13056
/*
 * Copy a kernel stat structure into the user_stat layout seen by 32-bit
 * processes.  Time fields are explicitly narrowed to the 32-bit user ABI
 * types (user32_time_t / user32_long_t), truncating wider kernel values.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero the whole destination so padding/unused fields don't leak. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Non-strict-POSIX builds expose the timespec-style time fields. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* Strict POSIX builds use the flat st_*time / st_*timensec names. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13093
13094 /*
13095 * copy stat64 structure into user_stat64 structure.
13096 */
13097 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)13098 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
13099 {
13100 bzero(usbp, sizeof(*usbp));
13101
13102 usbp->st_dev = sbp->st_dev;
13103 usbp->st_ino = sbp->st_ino;
13104 usbp->st_mode = sbp->st_mode;
13105 usbp->st_nlink = sbp->st_nlink;
13106 usbp->st_uid = sbp->st_uid;
13107 usbp->st_gid = sbp->st_gid;
13108 usbp->st_rdev = sbp->st_rdev;
13109 #ifndef _POSIX_C_SOURCE
13110 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13111 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13112 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13113 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13114 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13115 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13116 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
13117 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
13118 #else
13119 usbp->st_atime = sbp->st_atime;
13120 usbp->st_atimensec = sbp->st_atimensec;
13121 usbp->st_mtime = sbp->st_mtime;
13122 usbp->st_mtimensec = sbp->st_mtimensec;
13123 usbp->st_ctime = sbp->st_ctime;
13124 usbp->st_ctimensec = sbp->st_ctimensec;
13125 usbp->st_birthtime = sbp->st_birthtime;
13126 usbp->st_birthtimensec = sbp->st_birthtimensec;
13127 #endif
13128 usbp->st_size = sbp->st_size;
13129 usbp->st_blocks = sbp->st_blocks;
13130 usbp->st_blksize = sbp->st_blksize;
13131 usbp->st_flags = sbp->st_flags;
13132 usbp->st_gen = sbp->st_gen;
13133 usbp->st_lspare = sbp->st_lspare;
13134 usbp->st_qspare[0] = sbp->st_qspare[0];
13135 usbp->st_qspare[1] = sbp->st_qspare[1];
13136 }
13137
/*
 * Copy a kernel stat64 structure into the user_stat64 layout seen by
 * 32-bit processes.  Time fields (including birthtime) are explicitly
 * narrowed to the 32-bit user ABI types, truncating wider kernel values.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero the whole destination so padding/unused fields don't leak. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Non-strict-POSIX builds expose the timespec-style time fields. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* Strict POSIX builds use the flat st_*time / st_*timensec names. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13178
13179 /*
13180 * Purge buffer cache for simulating cold starts
13181 */
13182 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13183 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13184 {
13185 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13186
13187 return VNODE_RETURNED;
13188 }
13189
13190 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13191 vfs_purge_callback(mount_t mp, __unused void * arg)
13192 {
13193 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13194
13195 return VFS_RETURNED;
13196 }
13197
13198 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13199 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13200 {
13201 if (!kauth_cred_issuser(kauth_cred_get())) {
13202 return EPERM;
13203 }
13204
13205 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13206
13207 return 0;
13208 }
13209
13210 /*
13211 * gets the vnode associated with the (unnamed) snapshot directory
13212 * for a Filesystem. The snapshot directory vnode is returned with
13213 * an iocount on it.
13214 */
13215 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13216 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13217 {
13218 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13219 }
13220
13221 /*
13222 * Get the snapshot vnode.
13223 *
13224 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13225 * needs nameidone() on ndp.
13226 *
13227 * If the snapshot vnode exists it is returned in ndp->ni_vp.
13228 *
13229 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13230 * not needed.
13231 */
13232 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)13233 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
13234 user_addr_t name, struct nameidata *ndp, int32_t op,
13235 #if !CONFIG_TRIGGERS
13236 __unused
13237 #endif
13238 enum path_operation pathop,
13239 vfs_context_t ctx)
13240 {
13241 int error, i;
13242 caddr_t name_buf;
13243 size_t name_len;
13244 struct vfs_attr vfa;
13245
13246 *sdvpp = NULLVP;
13247 *rvpp = NULLVP;
13248
13249 error = vnode_getfromfd(ctx, dirfd, rvpp);
13250 if (error) {
13251 return error;
13252 }
13253
13254 if (!vnode_isvroot(*rvpp)) {
13255 error = EINVAL;
13256 goto out;
13257 }
13258
13259 /* Make sure the filesystem supports snapshots */
13260 VFSATTR_INIT(&vfa);
13261 VFSATTR_WANTED(&vfa, f_capabilities);
13262 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
13263 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
13264 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
13265 VOL_CAP_INT_SNAPSHOT)) ||
13266 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
13267 VOL_CAP_INT_SNAPSHOT))) {
13268 error = ENOTSUP;
13269 goto out;
13270 }
13271
13272 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
13273 if (error) {
13274 goto out;
13275 }
13276
13277 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
13278 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
13279 if (error) {
13280 goto out1;
13281 }
13282
13283 /*
13284 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
13285 * (the length returned by copyinstr includes the terminating NUL)
13286 */
13287 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
13288 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
13289 error = EINVAL;
13290 goto out1;
13291 }
13292 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
13293 ;
13294 }
13295 if (i < (int)name_len) {
13296 error = EINVAL;
13297 goto out1;
13298 }
13299
13300 #if CONFIG_MACF
13301 if (op == CREATE) {
13302 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
13303 name_buf);
13304 } else if (op == DELETE) {
13305 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
13306 name_buf);
13307 }
13308 if (error) {
13309 goto out1;
13310 }
13311 #endif
13312
13313 /* Check if the snapshot already exists ... */
13314 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
13315 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
13316 ndp->ni_dvp = *sdvpp;
13317
13318 error = namei(ndp);
13319 out1:
13320 zfree(ZV_NAMEI, name_buf);
13321 out:
13322 if (error) {
13323 if (*sdvpp) {
13324 vnode_put(*sdvpp);
13325 *sdvpp = NULLVP;
13326 }
13327 if (*rvpp) {
13328 vnode_put(*rvpp);
13329 *rvpp = NULLVP;
13330 }
13331 }
13332 return error;
13333 }
13334
13335 /*
13336 * create a filesystem snapshot (for supporting filesystems)
13337 *
13338 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13339 * We get to the (unnamed) snapshot directory vnode and create the vnode
13340 * for the snapshot in it.
13341 *
13342 * Restrictions:
13343 *
13344 * a) Passed in name for snapshot cannot have slashes.
13345 * b) name can't be "." or ".."
13346 *
13347 * Since this requires superuser privileges, vnode_authorize calls are not
13348 * made.
13349 */
13350 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)13351 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
13352 vfs_context_t ctx)
13353 {
13354 vnode_t rvp, snapdvp;
13355 int error;
13356 struct nameidata *ndp;
13357
13358 ndp = kalloc_type(struct nameidata, Z_WAITOK);
13359
13360 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
13361 OP_LINK, ctx);
13362 if (error) {
13363 goto out;
13364 }
13365
13366 if (ndp->ni_vp) {
13367 vnode_put(ndp->ni_vp);
13368 error = EEXIST;
13369 } else {
13370 struct vnode_attr *vap;
13371 vnode_t vp = NULLVP;
13372
13373 vap = kalloc_type(struct vnode_attr, Z_WAITOK);
13374
13375 VATTR_INIT(vap);
13376 VATTR_SET(vap, va_type, VREG);
13377 VATTR_SET(vap, va_mode, 0);
13378
13379 error = vn_create(snapdvp, &vp, ndp, vap,
13380 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
13381 if (!error && vp) {
13382 vnode_put(vp);
13383 }
13384
13385 kfree_type(struct vnode_attr, vap);
13386 }
13387
13388 nameidone(ndp);
13389 vnode_put(snapdvp);
13390 vnode_put(rvp);
13391 out:
13392 kfree_type(struct nameidata, ndp);
13393
13394 return error;
13395 }
13396
13397 /*
13398 * Delete a Filesystem snapshot
13399 *
13400 * get the vnode for the unnamed snapshot directory and the snapshot and
13401 * delete the snapshot.
13402 */
13403 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)13404 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
13405 vfs_context_t ctx)
13406 {
13407 vnode_t rvp, snapdvp;
13408 int error;
13409 struct nameidata *ndp;
13410
13411 ndp = kalloc_type(struct nameidata, Z_WAITOK);
13412
13413 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
13414 OP_UNLINK, ctx);
13415 if (error) {
13416 goto out;
13417 }
13418
13419 error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
13420 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
13421
13422 vnode_put(ndp->ni_vp);
13423 nameidone(ndp);
13424 vnode_put(snapdvp);
13425 vnode_put(rvp);
13426 out:
13427 kfree_type(struct nameidata, ndp);
13428
13429 return error;
13430 }
13431
13432 /*
13433 * Revert a filesystem to a snapshot
13434 *
13435 * Marks the filesystem to revert to the given snapshot on next mount.
13436 */
13437 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)13438 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
13439 vfs_context_t ctx)
13440 {
13441 int error;
13442 vnode_t rvp;
13443 mount_t mp;
13444 struct fs_snapshot_revert_args revert_data;
13445 struct componentname cnp;
13446 caddr_t name_buf;
13447 size_t name_len;
13448
13449 error = vnode_getfromfd(ctx, dirfd, &rvp);
13450 if (error) {
13451 return error;
13452 }
13453 mp = vnode_mount(rvp);
13454
13455 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
13456 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
13457 if (error) {
13458 zfree(ZV_NAMEI, name_buf);
13459 vnode_put(rvp);
13460 return error;
13461 }
13462
13463 #if CONFIG_MACF
13464 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
13465 if (error) {
13466 zfree(ZV_NAMEI, name_buf);
13467 vnode_put(rvp);
13468 return error;
13469 }
13470 #endif
13471
13472 /*
13473 * Grab mount_iterref so that we can release the vnode,
13474 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
13475 */
13476 error = mount_iterref(mp, 0);
13477 vnode_put(rvp);
13478 if (error) {
13479 zfree(ZV_NAMEI, name_buf);
13480 return error;
13481 }
13482
13483 memset(&cnp, 0, sizeof(cnp));
13484 cnp.cn_pnbuf = (char *)name_buf;
13485 cnp.cn_nameiop = LOOKUP;
13486 cnp.cn_flags = ISLASTCN | HASBUF;
13487 cnp.cn_pnlen = MAXPATHLEN;
13488 cnp.cn_nameptr = cnp.cn_pnbuf;
13489 cnp.cn_namelen = (int)name_len;
13490 revert_data.sr_cnp = &cnp;
13491
13492 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
13493 mount_iterdrop(mp);
13494 zfree(ZV_NAMEI, name_buf);
13495
13496 if (error) {
13497 /* If there was any error, try again using VNOP_IOCTL */
13498
13499 vnode_t snapdvp;
13500 struct nameidata namend;
13501
13502 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
13503 OP_LOOKUP, ctx);
13504 if (error) {
13505 return error;
13506 }
13507
13508
13509 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
13510 0, ctx);
13511
13512 vnode_put(namend.ni_vp);
13513 nameidone(&namend);
13514 vnode_put(snapdvp);
13515 vnode_put(rvp);
13516 }
13517
13518 return error;
13519 }
13520
13521 /*
13522 * rename a Filesystem snapshot
13523 *
13524 * get the vnode for the unnamed snapshot directory and the snapshot and
13525 * rename the snapshot. This is a very specialised (and simple) case of
13526 * rename(2) (which has to deal with a lot more complications). It differs
13527 * slightly from rename(2) in that EEXIST is returned if the new name exists.
13528 */
13529 static int __attribute__((noinline))
snapshot_rename(int dirfd,user_addr_t old,user_addr_t new,__unused uint32_t flags,vfs_context_t ctx)13530 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
13531 __unused uint32_t flags, vfs_context_t ctx)
13532 {
13533 vnode_t rvp, snapdvp;
13534 int error, i;
13535 caddr_t newname_buf;
13536 size_t name_len;
13537 vnode_t fvp;
13538 struct nameidata *fromnd, *tond;
13539 /* carving out a chunk for structs that are too big to be on stack. */
13540 struct {
13541 struct nameidata from_node;
13542 struct nameidata to_node;
13543 } * __rename_data;
13544
13545 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
13546 fromnd = &__rename_data->from_node;
13547 tond = &__rename_data->to_node;
13548
13549 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
13550 OP_UNLINK, ctx);
13551 if (error) {
13552 goto out;
13553 }
13554 fvp = fromnd->ni_vp;
13555
13556 newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
13557 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
13558 if (error) {
13559 goto out1;
13560 }
13561
13562 /*
13563 * Some sanity checks- new name can't be empty, "." or ".." or have
13564 * slashes.
13565 * (the length returned by copyinstr includes the terminating NUL)
13566 *
13567 * The FS rename VNOP is suppossed to handle this but we'll pick it
13568 * off here itself.
13569 */
13570 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
13571 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
13572 error = EINVAL;
13573 goto out1;
13574 }
13575 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
13576 ;
13577 }
13578 if (i < (int)name_len) {
13579 error = EINVAL;
13580 goto out1;
13581 }
13582
13583 #if CONFIG_MACF
13584 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
13585 newname_buf);
13586 if (error) {
13587 goto out1;
13588 }
13589 #endif
13590
13591 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
13592 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
13593 tond->ni_dvp = snapdvp;
13594
13595 error = namei(tond);
13596 if (error) {
13597 goto out2;
13598 } else if (tond->ni_vp) {
13599 /*
13600 * snapshot rename behaves differently than rename(2) - if the
13601 * new name exists, EEXIST is returned.
13602 */
13603 vnode_put(tond->ni_vp);
13604 error = EEXIST;
13605 goto out2;
13606 }
13607
13608 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
13609 &tond->ni_cnd, ctx);
13610
13611 out2:
13612 nameidone(tond);
13613 out1:
13614 zfree(ZV_NAMEI, newname_buf);
13615 vnode_put(fvp);
13616 vnode_put(snapdvp);
13617 vnode_put(rvp);
13618 nameidone(fromnd);
13619 out:
13620 kfree_type(typeof(*__rename_data), __rename_data);
13621 return error;
13622 }
13623
13624 /*
13625 * Mount a Filesystem snapshot
13626 *
13627 * get the vnode for the unnamed snapshot directory and the snapshot and
13628 * mount the snapshot.
13629 */
13630 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)13631 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
13632 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
13633 {
13634 mount_t mp;
13635 vnode_t rvp, snapdvp, snapvp, vp, pvp;
13636 struct fs_snapshot_mount_args smnt_data;
13637 int error;
13638 struct nameidata *snapndp, *dirndp;
13639 /* carving out a chunk for structs that are too big to be on stack. */
13640 struct {
13641 struct nameidata snapnd;
13642 struct nameidata dirnd;
13643 } * __snapshot_mount_data;
13644
13645 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
13646 snapndp = &__snapshot_mount_data->snapnd;
13647 dirndp = &__snapshot_mount_data->dirnd;
13648
13649 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
13650 OP_LOOKUP, ctx);
13651 if (error) {
13652 goto out;
13653 }
13654
13655 snapvp = snapndp->ni_vp;
13656 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
13657 error = EIO;
13658 goto out1;
13659 }
13660
13661 /* Get the vnode to be covered */
13662 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
13663 UIO_USERSPACE, directory, ctx);
13664 error = namei(dirndp);
13665 if (error) {
13666 goto out1;
13667 }
13668
13669 vp = dirndp->ni_vp;
13670 pvp = dirndp->ni_dvp;
13671 mp = vnode_mount(rvp);
13672
13673 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
13674 error = EINVAL;
13675 goto out2;
13676 }
13677
13678 #if CONFIG_MACF
13679 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
13680 mp->mnt_vfsstat.f_fstypename);
13681 if (error) {
13682 goto out2;
13683 }
13684 #endif
13685
13686 smnt_data.sm_mp = mp;
13687 smnt_data.sm_cnp = &snapndp->ni_cnd;
13688 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
13689 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
13690 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
13691
13692 out2:
13693 vnode_put(vp);
13694 vnode_put(pvp);
13695 nameidone(dirndp);
13696 out1:
13697 vnode_put(snapvp);
13698 vnode_put(snapdvp);
13699 vnode_put(rvp);
13700 nameidone(snapndp);
13701 out:
13702 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
13703 return error;
13704 }
13705
13706 /*
13707 * Root from a snapshot of the filesystem
13708 *
13709 * Marks the filesystem to root from the given snapshot on next boot.
13710 */
13711 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)13712 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
13713 vfs_context_t ctx)
13714 {
13715 int error;
13716 vnode_t rvp;
13717 mount_t mp;
13718 struct fs_snapshot_root_args root_data;
13719 struct componentname cnp;
13720 caddr_t name_buf;
13721 size_t name_len;
13722
13723 error = vnode_getfromfd(ctx, dirfd, &rvp);
13724 if (error) {
13725 return error;
13726 }
13727 mp = vnode_mount(rvp);
13728
13729 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
13730 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
13731 if (error) {
13732 zfree(ZV_NAMEI, name_buf);
13733 vnode_put(rvp);
13734 return error;
13735 }
13736
13737 // XXX MAC checks ?
13738
13739 /*
13740 * Grab mount_iterref so that we can release the vnode,
13741 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
13742 */
13743 error = mount_iterref(mp, 0);
13744 vnode_put(rvp);
13745 if (error) {
13746 zfree(ZV_NAMEI, name_buf);
13747 return error;
13748 }
13749
13750 memset(&cnp, 0, sizeof(cnp));
13751 cnp.cn_pnbuf = (char *)name_buf;
13752 cnp.cn_nameiop = LOOKUP;
13753 cnp.cn_flags = ISLASTCN | HASBUF;
13754 cnp.cn_pnlen = MAXPATHLEN;
13755 cnp.cn_nameptr = cnp.cn_pnbuf;
13756 cnp.cn_namelen = (int)name_len;
13757 root_data.sr_cnp = &cnp;
13758
13759 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
13760
13761 mount_iterdrop(mp);
13762 zfree(ZV_NAMEI, name_buf);
13763
13764 return error;
13765 }
13766
13767 /*
13768 * FS snapshot operations dispatcher
13769 */
13770 int
fs_snapshot(__unused proc_t p,struct fs_snapshot_args * uap,__unused int32_t * retval)13771 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
13772 __unused int32_t *retval)
13773 {
13774 int error;
13775 vfs_context_t ctx = vfs_context_current();
13776
13777 AUDIT_ARG(fd, uap->dirfd);
13778 AUDIT_ARG(value32, uap->op);
13779
13780 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
13781 if (error) {
13782 return error;
13783 }
13784
13785 /*
13786 * Enforce user authorization for snapshot modification operations,
13787 * or if trying to root from snapshot.
13788 */
13789 if (uap->op != SNAPSHOT_OP_MOUNT) {
13790 vnode_t dvp = NULLVP;
13791 vnode_t devvp = NULLVP;
13792 mount_t mp;
13793
13794 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
13795 if (error) {
13796 return error;
13797 }
13798 mp = vnode_mount(dvp);
13799 devvp = mp->mnt_devvp;
13800
13801 /* get an iocount on devvp */
13802 if (devvp == NULLVP) {
13803 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
13804 /* for mounts which arent block devices */
13805 if (error == ENOENT) {
13806 error = ENXIO;
13807 }
13808 } else {
13809 error = vnode_getwithref(devvp);
13810 }
13811
13812 if (error) {
13813 vnode_put(dvp);
13814 return error;
13815 }
13816
13817 if ((vfs_context_issuser(ctx) == 0) &&
13818 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
13819 (!IOCurrentTaskHasEntitlement("com.apple.private.vfs.snapshot.user"))) {
13820 error = EPERM;
13821 }
13822 vnode_put(dvp);
13823 vnode_put(devvp);
13824
13825 if (error) {
13826 return error;
13827 }
13828 }
13829
13830 switch (uap->op) {
13831 case SNAPSHOT_OP_CREATE:
13832 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
13833 break;
13834 case SNAPSHOT_OP_DELETE:
13835 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
13836 break;
13837 case SNAPSHOT_OP_RENAME:
13838 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
13839 uap->flags, ctx);
13840 break;
13841 case SNAPSHOT_OP_MOUNT:
13842 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
13843 uap->data, uap->flags, ctx);
13844 break;
13845 case SNAPSHOT_OP_REVERT:
13846 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
13847 break;
13848 #if CONFIG_MNT_ROOTSNAP
13849 case SNAPSHOT_OP_ROOT:
13850 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
13851 break;
13852 #endif /* CONFIG_MNT_ROOTSNAP */
13853 default:
13854 error = ENOSYS;
13855 }
13856
13857 return error;
13858 }
13859