1 /*
2 * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137
138 #include <nfs/nfs_conf.h>
139
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 ((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 release_pathbuff(x)
154 #else
155 #define GET_PATH(x) \
156 ((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
171 #endif
172
173 extern void disk_conditioner_unmount(mount_t mp);
174
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 vnode_t olddp;
178 vnode_t newdp;
179 };
180 /* callback for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192 boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195 struct componentname *cnp, user_addr_t fsmountargs,
196 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200
201 struct fd_vn_data * fg_vn_data_alloc(void);
202
203 /*
204 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205 * Concurrent lookups (or lookups by ids) on hard links can cause the
206 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207 * does) to return ENOENT as the path cannot be returned from the name cache
208 * alone. We have no option but to retry and hope to get one namei->reverse path
209 * generation done without an intervening lookup, lookup by id on the hard link
210 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211 * which currently are the MAC hooks for rename, unlink and rmdir.
212 */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219 int unlink_flags);
220
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236
237 __private_extern__
238 int sync_internal(void);
239
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249
250 extern lck_rw_t rootvnode_rw_lock;
251
252 /*
253 * incremented each time a mount or unmount operation occurs
254 * used to invalidate the cached value of the rootvp in the
255 * mount structure utilized by cache_lookup_path
256 */
257 uint32_t mount_generation = 0;
258
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261
262 /* system-wide, per-boot unique mount ID */
263 static _Atomic uint64_t mount_unique_id = 1;
264
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269
270 /*
271 * Virtual File System System Calls
272 */
273
274 /*
275 * Private in-kernel mounting spi (specific use-cases only)
276 */
277 boolean_t
vfs_iskernelmount(mount_t mp)278 vfs_iskernelmount(mount_t mp)
279 {
280 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
281 }
282
/*
 * kernel_mount
 *
 * Private in-kernel mount entry point (specific use-cases only).
 *
 * Parameters:
 *	fstype		file system type name (its vfs name)
 *	pvp		parent of the vnode to be covered; ignored and
 *			re-resolved via namei() when 'vp' is NULLVP
 *	vp		vnode to be covered, or NULLVP to look it up from 'path'
 *	path		mount-on path (kernel address space)
 *	data		file system specific mount arguments
 *	datalen		size of 'data' (currently unused)
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* flags; sanitized against
 *			KERNEL_MOUNT_SANITIZE_MASK before use
 *	ctx		caller's vfs context
 *
 * Returns:	0		Success
 *		!0		errno from namei() or mount_common()
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Only allow kernel-mount flag bits that callers may legitimately request. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode; fabricate just enough of
		 * the componentname for mount_common() from the raw path.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	if (did_namei) {
		/* Drop the iocounts namei() took and release its pathbuf. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
332
333 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)334 vfs_mount_at_path(const char *fstype, const char *path,
335 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
336 int mnt_flags, int flags)
337 {
338 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
339 int error, km_flags = 0;
340
341 /*
342 * This call is currently restricted to specific use cases.
343 */
344 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
345 return ENOTSUP;
346 }
347
348 #if !defined(XNU_TARGET_OS_OSX)
349 if (strcmp(fstype, "lifs") == 0) {
350 syscall_flags |= MNT_NOEXEC;
351 }
352 #endif
353
354 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
355 km_flags |= KERNEL_MOUNT_NOAUTH;
356 }
357 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
358 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
359 }
360
361 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
362 syscall_flags, km_flags, vfs_context_kernel());
363 if (error) {
364 printf("%s: mount on %s failed, error %d\n", __func__, path,
365 error);
366 }
367
368 return error;
369 }
370
371 int
vfs_mount_override_type_name(mount_t mp,const char * name)372 vfs_mount_override_type_name(mount_t mp, const char *name)
373 {
374 if (mp == NULL || name == NULL) {
375 return EINVAL;
376 }
377
378 /* Override the FS type name. */
379 mount_lock_spin(mp);
380 strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
381 mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
382 mount_unlock(mp);
383
384 return 0;
385 }
386
387 /*
388 * Mount a file system.
389 */
390 /* ARGSUSED */
391 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)392 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
393 {
394 struct __mac_mount_args muap;
395
396 muap.type = uap->type;
397 muap.path = uap->path;
398 muap.flags = uap->flags;
399 muap.data = uap->data;
400 muap.mac_p = USER_ADDR_NULL;
401 return __mac_mount(p, &muap, retval);
402 }
403
404 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)405 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
406 {
407 struct componentname cn;
408 vfs_context_t ctx = vfs_context_current();
409 size_t dummy = 0;
410 int error;
411 int flags = uap->flags;
412 char fstypename[MFSNAMELEN];
413 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
414 vnode_t pvp;
415 vnode_t vp;
416
417 AUDIT_ARG(fd, uap->fd);
418 AUDIT_ARG(fflags, flags);
419 /* fstypename will get audited by mount_common */
420
421 /* Sanity check the flags */
422 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
423 return ENOTSUP;
424 }
425
426 if (flags & MNT_UNION) {
427 return EPERM;
428 }
429
430 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
431 if (error) {
432 return error;
433 }
434
435 if ((error = file_vnode(uap->fd, &vp)) != 0) {
436 return error;
437 }
438
439 if ((error = vnode_getwithref(vp)) != 0) {
440 file_drop(uap->fd);
441 return error;
442 }
443
444 pvp = vnode_getparent(vp);
445 if (pvp == NULL) {
446 vnode_put(vp);
447 file_drop(uap->fd);
448 return EINVAL;
449 }
450
451 memset(&cn, 0, sizeof(struct componentname));
452 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
453 cn.cn_pnlen = MAXPATHLEN;
454
455 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
456 zfree(ZV_NAMEI, cn.cn_pnbuf);
457 vnode_put(pvp);
458 vnode_put(vp);
459 file_drop(uap->fd);
460 return error;
461 }
462
463 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
464
465 zfree(ZV_NAMEI, cn.cn_pnbuf);
466 vnode_put(pvp);
467 vnode_put(vp);
468 file_drop(uap->fd);
469
470 return error;
471 }
472
/*
 * Notify interested parties that a mount has appeared: broadcast a VQ_MOUNT
 * vfs event and post a NOTE_WRITE knote on 'pdvp', the parent directory of
 * the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
479
480 /*
481 * __mac_mount:
482 * Mount a file system taking into account MAC label behavior.
483 * See mount(2) man page for more information
484 *
485 * Parameters: p Process requesting the mount
486 * uap User argument descriptor (see below)
487 * retval (ignored)
488 *
489 * Indirect: uap->type Filesystem type
490 * uap->path Path to mount
491 * uap->data Mount arguments
492 * uap->mac_p MAC info
493 * uap->flags Mount flags
494 *
495 *
496 * Returns: 0 Success
497 * !0 Not success
498 */
499 boolean_t root_fs_upgrade_try = FALSE;
500
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Normalize the 32-bit or 64-bit user 'struct mac' into 'mac'. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Reject label buffers that are empty or implausibly large. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out; refuse the request outright. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special-case a mount covering '/', the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() is a no-op when labelstr is NULL. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
661
662 /*
663 * common mount implementation (final stage of mounting)
664 *
665 * Arguments:
666 * fstypename file system type (ie it's vfs name)
667 * pvp parent of covered vnode
668 * vp covered vnode
669 * cnp component name (ie path) of covered vnode
670 * flags generic mount flags
671 * fsmountargs file system specific data
672 * labelstr optional MAC label
673 * kernelmount TRUE for mounts initiated from inside the kernel
674 * ctx caller's context
675 */
676 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)677 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
678 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
679 char *labelstr, vfs_context_t ctx)
680 {
681 #if !CONFIG_MACF
682 #pragma unused(labelstr)
683 #endif
684 struct vnode *devvp = NULLVP;
685 struct vnode *device_vnode = NULLVP;
686 #if CONFIG_MACF
687 struct vnode *rvp;
688 #endif
689 struct mount *mp;
690 struct vfstable *vfsp = (struct vfstable *)0;
691 struct proc *p = vfs_context_proc(ctx);
692 int error, flag = 0;
693 bool flag_set = false;
694 user_addr_t devpath = USER_ADDR_NULL;
695 int ronly = 0;
696 int mntalloc = 0;
697 boolean_t vfsp_ref = FALSE;
698 boolean_t is_rwlock_locked = FALSE;
699 boolean_t did_rele = FALSE;
700 boolean_t have_usecount = FALSE;
701 boolean_t did_set_lmount = FALSE;
702 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
703
704 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
705 /* Check for mutually-exclusive flag bits */
706 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
707 int bitcount = 0;
708 while (checkflags != 0) {
709 checkflags &= (checkflags - 1);
710 bitcount++;
711 }
712
713 if (bitcount > 1) {
714 //not allowed to request multiple mount-by-role flags
715 error = EINVAL;
716 goto out1;
717 }
718 #endif
719
720 /*
721 * Process an update for an existing mount
722 */
723 if (flags & MNT_UPDATE) {
724 if ((vp->v_flag & VROOT) == 0) {
725 error = EINVAL;
726 goto out1;
727 }
728 mp = vp->v_mount;
729
730 /* if unmount or mount in progress, return error */
731 mount_lock_spin(mp);
732 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
733 mount_unlock(mp);
734 error = EBUSY;
735 goto out1;
736 }
737 mp->mnt_lflag |= MNT_LMOUNT;
738 did_set_lmount = TRUE;
739 mount_unlock(mp);
740 lck_rw_lock_exclusive(&mp->mnt_rwlock);
741 is_rwlock_locked = TRUE;
742 /*
743 * We only allow the filesystem to be reloaded if it
744 * is currently mounted read-only.
745 */
746 if ((flags & MNT_RELOAD) &&
747 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
748 error = ENOTSUP;
749 goto out1;
750 }
751
752 /*
753 * If content protection is enabled, update mounts are not
754 * allowed to turn it off.
755 */
756 if ((mp->mnt_flag & MNT_CPROTECT) &&
757 ((flags & MNT_CPROTECT) == 0)) {
758 error = EINVAL;
759 goto out1;
760 }
761
762 /*
763 * can't turn off MNT_REMOVABLE either but it may be an unexpected
764 * failure to return an error for this so we'll just silently
765 * add it if it is not passed in.
766 */
767 if ((mp->mnt_flag & MNT_REMOVABLE) &&
768 ((flags & MNT_REMOVABLE) == 0)) {
769 flags |= MNT_REMOVABLE;
770 }
771
772 /* Can't downgrade the backer of the root FS */
773 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
774 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
775 error = ENOTSUP;
776 goto out1;
777 }
778
779 /*
780 * Only root, or the user that did the original mount is
781 * permitted to update it.
782 */
783 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
784 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
785 goto out1;
786 }
787 #if CONFIG_MACF
788 error = mac_mount_check_remount(ctx, mp);
789 if (error != 0) {
790 goto out1;
791 }
792 #endif
793 /*
794 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
795 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
796 */
797 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
798 flags |= MNT_NOSUID | MNT_NODEV;
799 if (mp->mnt_flag & MNT_NOEXEC) {
800 flags |= MNT_NOEXEC;
801 }
802 }
803 flag = mp->mnt_flag;
804 flag_set = true;
805
806
807
808 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
809
810 vfsp = mp->mnt_vtable;
811 goto update;
812 } // MNT_UPDATE
813
814 /*
815 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
816 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
817 */
818 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
819 flags |= MNT_NOSUID | MNT_NODEV;
820 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
821 flags |= MNT_NOEXEC;
822 }
823 }
824
825 /* XXXAUDIT: Should we capture the type on the error path as well? */
826 /* XXX cast-away const (audit_arg_text() does not modify its input) */
827 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
828 mount_list_lock();
829 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
830 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
831 vfsp->vfc_refcount++;
832 vfsp_ref = TRUE;
833 break;
834 }
835 }
836 mount_list_unlock();
837 if (vfsp == NULL) {
838 error = ENODEV;
839 goto out1;
840 }
841
842 /*
843 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
844 * except in ROSV configs and for the initial BaseSystem root.
845 */
846 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
847 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
848 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
849 error = EINVAL; /* unsupported request */
850 goto out1;
851 }
852
853 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
854 if (error != 0) {
855 goto out1;
856 }
857
858 /*
859 * Allocate and initialize the filesystem (mount_t)
860 */
861 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
862 mntalloc = 1;
863
864 /* Initialize the default IO constraints */
865 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
866 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
867 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
868 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
869 mp->mnt_devblocksize = DEV_BSIZE;
870 mp->mnt_alignmentmask = PAGE_MASK;
871 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
872 mp->mnt_ioscale = 1;
873 mp->mnt_ioflags = 0;
874 mp->mnt_realrootvp = NULLVP;
875 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
876
877 mp->mnt_lflag |= MNT_LMOUNT;
878 did_set_lmount = TRUE;
879
880 TAILQ_INIT(&mp->mnt_vnodelist);
881 TAILQ_INIT(&mp->mnt_workerqueue);
882 TAILQ_INIT(&mp->mnt_newvnodes);
883 mount_lock_init(mp);
884 lck_rw_lock_exclusive(&mp->mnt_rwlock);
885 is_rwlock_locked = TRUE;
886 mp->mnt_op = vfsp->vfc_vfsops;
887 mp->mnt_vtable = vfsp;
888 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
889 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
890 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
891 do {
892 int pathlen = MAXPATHLEN;
893
894 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
895 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
896 }
897 } while (0);
898 mp->mnt_vnodecovered = vp;
899 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
900 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
901 mp->mnt_devbsdunit = 0;
902 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
903
904 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
905 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
906
907 if (kernelmount) {
908 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
909 }
910 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
911 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
912 }
913
914 if (KERNEL_MOUNT_DEVFS & internal_flags) {
915 // kernel mounted devfs
916 mp->mnt_kern_flag |= MNTK_SYSTEM;
917 }
918
919 update:
920
921 /*
922 * Set the mount level flags.
923 */
924 if (flags & MNT_RDONLY) {
925 mp->mnt_flag |= MNT_RDONLY;
926 } else if (mp->mnt_flag & MNT_RDONLY) {
927 // disallow read/write upgrades of file systems that
928 // had the TYPENAME_OVERRIDE feature set.
929 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
930 error = EPERM;
931 goto out1;
932 }
933 mp->mnt_kern_flag |= MNTK_WANTRDWR;
934 }
935 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
936 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
937 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
938 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
939 MNT_QUARANTINE | MNT_CPROTECT);
940
941 #if SECURE_KERNEL
942 #if !CONFIG_MNT_SUID
943 /*
944 * On release builds of iOS based platforms, always enforce NOSUID on
945 * all mounts. We do this here because we can catch update mounts as well as
946 * non-update mounts in this case.
947 */
948 mp->mnt_flag |= (MNT_NOSUID);
949 #endif
950 #endif
951
952 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
953 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
954 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
955 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
956 MNT_QUARANTINE | MNT_CPROTECT);
957
958 #if CONFIG_MACF
959 if (flags & MNT_MULTILABEL) {
960 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
961 error = EINVAL;
962 goto out1;
963 }
964 mp->mnt_flag |= MNT_MULTILABEL;
965 }
966 #endif
967 /*
968 * Process device path for local file systems if requested.
969 *
970 * Snapshot and mount-by-role mounts do not use this path; they are
971 * passing other opaque data in the device path field.
972 *
973 * Basesystemroot mounts pass a device path to be resolved here,
974 * but it's just a char * already inside the kernel, which
975 * kernel_mount() shoved into a user_addr_t to call us. So for such
976 * mounts we must skip copyin (both of the address and of the string
977 * (in NDINIT).
978 */
979 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
980 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
981 boolean_t do_copyin_devpath = true;
982 #if CONFIG_BASESYSTEMROOT
983 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
985 // We have been passed fsmountargs, which is typed as a user_addr_t,
986 // but is actually a char ** pointing to a (kernelspace) string.
987 // We manually unpack it with a series of casts and dereferences
988 // that reverses what was done just above us on the stack in
989 // imageboot_pivot_image().
990 // After retrieving the path to the dev node (which we will NDINIT
991 // in a moment), we pass NULL fsmountargs on to the filesystem.
992 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
993 char **devnamepp = (char **)fsmountargs;
994 char *devnamep = *devnamepp;
995 devpath = CAST_USER_ADDR_T(devnamep);
996 do_copyin_devpath = false;
997 fsmountargs = USER_ADDR_NULL;
998
999 //Now that we have a mp, denote that this mount is for the basesystem.
1000 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1001 }
1002 #endif // CONFIG_BASESYSTEMROOT
1003
1004 if (do_copyin_devpath) {
1005 if (vfs_context_is64bit(ctx)) {
1006 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1007 goto out1;
1008 }
1009 fsmountargs += sizeof(devpath);
1010 } else {
1011 user32_addr_t tmp;
1012 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1013 goto out1;
1014 }
1015 /* munge into LP64 addr */
1016 devpath = CAST_USER_ADDR_T(tmp);
1017 fsmountargs += sizeof(tmp);
1018 }
1019 }
1020
1021 /* Lookup device and authorize access to it */
1022 if ((devpath)) {
1023 struct nameidata nd;
1024
1025 enum uio_seg seg = UIO_USERSPACE;
1026 #if CONFIG_BASESYSTEMROOT
1027 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1028 seg = UIO_SYSSPACE;
1029 }
1030 #endif // CONFIG_BASESYSTEMROOT
1031
1032 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1033 if ((error = namei(&nd))) {
1034 goto out1;
1035 }
1036
1037 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1038 devvp = nd.ni_vp;
1039
1040 nameidone(&nd);
1041
1042 if (devvp->v_type != VBLK) {
1043 error = ENOTBLK;
1044 goto out2;
1045 }
1046 if (major(devvp->v_rdev) >= nblkdev) {
1047 error = ENXIO;
1048 goto out2;
1049 }
1050 /*
1051 * If mount by non-root, then verify that user has necessary
1052 * permissions on the device.
1053 */
1054 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1055 mode_t accessmode = KAUTH_VNODE_READ_DATA;
1056
1057 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1058 accessmode |= KAUTH_VNODE_WRITE_DATA;
1059 }
1060 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1061 goto out2;
1062 }
1063 }
1064 }
1065 /* On first mount, preflight and open device */
1066 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1067 if ((error = vnode_ref(devvp))) {
1068 goto out2;
1069 }
1070 /*
1071 * Disallow multiple mounts of the same device.
1072 * Disallow mounting of a device that is currently in use
1073 * (except for root, which might share swap device for miniroot).
1074 * Flush out any old buffers remaining from a previous use.
1075 */
1076 if ((error = vfs_mountedon(devvp))) {
1077 goto out3;
1078 }
1079
1080 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1081 error = EBUSY;
1082 goto out3;
1083 }
1084 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1085 error = ENOTBLK;
1086 goto out3;
1087 }
1088 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1089 goto out3;
1090 }
1091
1092 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1093 #if CONFIG_MACF
1094 error = mac_vnode_check_open(ctx,
1095 devvp,
1096 ronly ? FREAD : FREAD | FWRITE);
1097 if (error) {
1098 goto out3;
1099 }
1100 #endif /* MAC */
1101 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1102 goto out3;
1103 }
1104
1105 mp->mnt_devvp = devvp;
1106 device_vnode = devvp;
1107 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1108 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1109 (device_vnode = mp->mnt_devvp)) {
1110 dev_t dev;
1111 int maj;
1112 /*
1113 * If upgrade to read-write by non-root, then verify
1114 * that user has necessary permissions on the device.
1115 */
1116 vnode_getalways(device_vnode);
1117
1118 if (suser(vfs_context_ucred(ctx), NULL) &&
1119 (error = vnode_authorize(device_vnode, NULL,
1120 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1121 ctx)) != 0) {
1122 vnode_put(device_vnode);
1123 goto out2;
1124 }
1125
1126 /* Tell the device that we're upgrading */
1127 dev = (dev_t)device_vnode->v_rdev;
1128 maj = major(dev);
1129
1130 if ((u_int)maj >= (u_int)nblkdev) {
1131 panic("Volume mounted on a device with invalid major number.");
1132 }
1133
1134 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1135 vnode_put(device_vnode);
1136 device_vnode = NULLVP;
1137 if (error != 0) {
1138 goto out2;
1139 }
1140 }
1141 } // localargs && !(snapshot | data | vm)
1142
1143 #if CONFIG_MACF
1144 if ((flags & MNT_UPDATE) == 0) {
1145 mac_mount_label_init(mp);
1146 mac_mount_label_associate(ctx, mp);
1147 }
1148 if (labelstr) {
1149 if ((flags & MNT_UPDATE) != 0) {
1150 error = mac_mount_check_label_update(ctx, mp);
1151 if (error != 0) {
1152 goto out3;
1153 }
1154 }
1155 }
1156 #endif
1157 /*
1158 * Mount the filesystem. We already asserted that internal_flags
1159 * cannot have more than one mount-by-role bit set.
1160 */
1161 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1162 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1163 (caddr_t)fsmountargs, 0, ctx);
1164 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1165 #if CONFIG_ROSV_STARTUP
1166 struct mount *origin_mp = (struct mount*)fsmountargs;
1167 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1168 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1169 if (error) {
1170 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1171 } else {
1172 /* Mark volume associated with system volume */
1173 mp->mnt_kern_flag |= MNTK_SYSTEM;
1174
1175 /* Attempt to acquire the mnt_devvp and set it up */
1176 struct vnode *mp_devvp = NULL;
1177 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1178 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1179 0, &mp_devvp, vfs_context_kernel());
1180 if (!lerr) {
1181 mp->mnt_devvp = mp_devvp;
1182 //vnode_lookup took an iocount, need to drop it.
1183 vnode_put(mp_devvp);
1184 // now set `device_vnode` to the devvp that was acquired.
1185 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1186 // note that though the iocount above was dropped, the mount acquires
1187 // an implicit reference against the device.
1188 device_vnode = mp_devvp;
1189 }
1190 }
1191 }
1192 #else
1193 error = EINVAL;
1194 #endif
1195 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1196 #if CONFIG_MOUNT_VM
1197 struct mount *origin_mp = (struct mount*)fsmountargs;
1198 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1199 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1200 if (error) {
1201 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1202 } else {
1203 /* Mark volume associated with system volume and a swap mount */
1204 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1205 /* Attempt to acquire the mnt_devvp and set it up */
1206 struct vnode *mp_devvp = NULL;
1207 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1208 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1209 0, &mp_devvp, vfs_context_kernel());
1210 if (!lerr) {
1211 mp->mnt_devvp = mp_devvp;
1212 //vnode_lookup took an iocount, need to drop it.
1213 vnode_put(mp_devvp);
1214
1215 // now set `device_vnode` to the devvp that was acquired.
1216 // note that though the iocount above was dropped, the mount acquires
1217 // an implicit reference against the device.
1218 device_vnode = mp_devvp;
1219 }
1220 }
1221 }
1222 #else
1223 error = EINVAL;
1224 #endif
1225 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1226 #if CONFIG_MOUNT_PREBOOTRECOVERY
1227 struct mount *origin_mp = (struct mount*)fsmountargs;
1228 uint32_t mount_role = 0;
1229 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1230 mount_role = VFS_PREBOOT_ROLE;
1231 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1232 mount_role = VFS_RECOVERY_ROLE;
1233 }
1234
1235 if (mount_role != 0) {
1236 fs_role_mount_args_t frma = {origin_mp, mount_role};
1237 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1238 if (error) {
1239 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1240 } else {
1241 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1242 /* Mark volume associated with system volume */
1243 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1244 /* Attempt to acquire the mnt_devvp and set it up */
1245 struct vnode *mp_devvp = NULL;
1246 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1247 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1248 0, &mp_devvp, vfs_context_kernel());
1249 if (!lerr) {
1250 mp->mnt_devvp = mp_devvp;
1251 //vnode_lookup took an iocount, need to drop it.
1252 vnode_put(mp_devvp);
1253
1254 // now set `device_vnode` to the devvp that was acquired.
1255 // note that though the iocount above was dropped, the mount acquires
1256 // an implicit reference against the device.
1257 device_vnode = mp_devvp;
1258 }
1259 }
1260 }
1261 } else {
1262 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1263 error = EINVAL;
1264 }
1265 #else
1266 error = EINVAL;
1267 #endif
1268 } else {
1269 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1270 }
1271
1272 if (flags & MNT_UPDATE) {
1273 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1274 mp->mnt_flag &= ~MNT_RDONLY;
1275 }
1276 mp->mnt_flag &= ~
1277 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1278 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1279 if (error) {
1280 mp->mnt_flag = flag; /* restore flag value */
1281 }
1282 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1283 lck_rw_done(&mp->mnt_rwlock);
1284 is_rwlock_locked = FALSE;
1285 if (!error) {
1286 enablequotas(mp, ctx);
1287 }
1288 goto exit;
1289 }
1290
1291 /*
1292 * Put the new filesystem on the mount list after root.
1293 */
1294 if (error == 0) {
1295 struct vfs_attr vfsattr;
1296 #if CONFIG_MACF
1297 error = mac_mount_check_mount_late(ctx, mp);
1298 if (error != 0) {
1299 goto out4;
1300 }
1301
1302 if (vfs_flags(mp) & MNT_MULTILABEL) {
1303 error = VFS_ROOT(mp, &rvp, ctx);
1304 if (error) {
1305 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1306 goto out4;
1307 }
1308 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1309 /*
1310 * drop reference provided by VFS_ROOT
1311 */
1312 vnode_put(rvp);
1313
1314 if (error) {
1315 goto out4;
1316 }
1317 }
1318 #endif /* MAC */
1319
1320 vnode_lock_spin(vp);
1321 CLR(vp->v_flag, VMOUNT);
1322 vp->v_mountedhere = mp;
1323 vnode_unlock(vp);
1324
1325 /*
1326 * taking the name_cache_lock exclusively will
1327 * insure that everyone is out of the fast path who
1328 * might be trying to use a now stale copy of
1329 * vp->v_mountedhere->mnt_realrootvp
1330 * bumping mount_generation causes the cached values
1331 * to be invalidated
1332 */
1333 name_cache_lock();
1334 mount_generation++;
1335 name_cache_unlock();
1336
1337 error = vnode_ref(vp);
1338 if (error != 0) {
1339 goto out4;
1340 }
1341
1342 have_usecount = TRUE;
1343
1344 error = checkdirs(vp, ctx);
1345 if (error != 0) {
1346 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1347 goto out4;
1348 }
1349 /*
1350 * there is no cleanup code here so I have made it void
1351 * we need to revisit this
1352 */
1353 (void)VFS_START(mp, 0, ctx);
1354
1355 if (mount_list_add(mp) != 0) {
1356 /*
1357 * The system is shutting down trying to umount
1358 * everything, so fail with a plausible errno.
1359 */
1360 error = EBUSY;
1361 goto out4;
1362 }
1363 lck_rw_done(&mp->mnt_rwlock);
1364 is_rwlock_locked = FALSE;
1365
1366 /* Check if this mounted file system supports EAs or named streams. */
1367 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1368 VFSATTR_INIT(&vfsattr);
1369 VFSATTR_WANTED(&vfsattr, f_capabilities);
1370 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1371 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1372 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1373 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1374 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1375 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1376 }
1377 #if NAMEDSTREAMS
1378 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1379 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1380 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1381 }
1382 #endif
1383 /* Check if this file system supports path from id lookups. */
1384 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1385 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1386 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1387 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1388 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1389 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1390 }
1391
1392 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1393 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1394 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1395 }
1396 }
1397 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1398 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1399 }
1400 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1401 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1402 }
1403 /* increment the operations count */
1404 OSAddAtomic(1, &vfs_nummntops);
1405 enablequotas(mp, ctx);
1406
1407 if (device_vnode) {
1408 device_vnode->v_specflags |= SI_MOUNTEDON;
1409
1410 /*
1411 * cache the IO attributes for the underlying physical media...
1412 * an error return indicates the underlying driver doesn't
1413 * support all the queries necessary... however, reasonable
1414 * defaults will have been set, so no reason to bail or care
1415 */
1416 vfs_init_io_attributes(device_vnode, mp);
1417 }
1418
1419 /* Now that mount is setup, notify the listeners */
1420 vfs_notify_mount(pvp);
1421 IOBSDMountChange(mp, kIOMountChangeMount);
1422 } else {
1423 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1424 if (mp->mnt_vnodelist.tqh_first != NULL) {
1425 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1426 mp->mnt_vtable->vfc_name, error);
1427 }
1428
1429 vnode_lock_spin(vp);
1430 CLR(vp->v_flag, VMOUNT);
1431 vnode_unlock(vp);
1432 mount_list_lock();
1433 mp->mnt_vtable->vfc_refcount--;
1434 mount_list_unlock();
1435
1436 if (device_vnode) {
1437 vnode_rele(device_vnode);
1438 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1439 }
1440 lck_rw_done(&mp->mnt_rwlock);
1441 is_rwlock_locked = FALSE;
1442
1443 /*
1444 * if we get here, we have a mount structure that needs to be freed,
1445 * but since the coveredvp hasn't yet been updated to point at it,
1446 * no need to worry about other threads holding a crossref on this mp
1447 * so it's ok to just free it
1448 */
1449 mount_lock_destroy(mp);
1450 #if CONFIG_MACF
1451 mac_mount_label_destroy(mp);
1452 #endif
1453 zfree(mount_zone, mp);
1454 did_set_lmount = false;
1455 }
1456 exit:
1457 /*
1458 * drop I/O count on the device vp if there was one
1459 */
1460 if (devpath && devvp) {
1461 vnode_put(devvp);
1462 }
1463
1464 if (did_set_lmount) {
1465 mount_lock_spin(mp);
1466 mp->mnt_lflag &= ~MNT_LMOUNT;
1467 mount_unlock(mp);
1468 }
1469
1470 return error;
1471
1472 /* Error condition exits */
1473 out4:
1474 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1475
1476 /*
1477 * If the mount has been placed on the covered vp,
1478 * it may have been discovered by now, so we have
1479 * to treat this just like an unmount
1480 */
1481 mount_lock_spin(mp);
1482 mp->mnt_lflag |= MNT_LDEAD;
1483 mount_unlock(mp);
1484
1485 if (device_vnode != NULLVP) {
1486 vnode_rele(device_vnode);
1487 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1488 ctx);
1489 did_rele = TRUE;
1490 }
1491
1492 vnode_lock_spin(vp);
1493
1494 mp->mnt_crossref++;
1495 vp->v_mountedhere = (mount_t) 0;
1496
1497 vnode_unlock(vp);
1498
1499 if (have_usecount) {
1500 vnode_rele(vp);
1501 }
1502 out3:
1503 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1504 vnode_rele(devvp);
1505 }
1506 out2:
1507 if (devpath && devvp) {
1508 vnode_put(devvp);
1509 }
1510 out1:
1511 /* Release mnt_rwlock only when it was taken */
1512 if (is_rwlock_locked == TRUE) {
1513 if (flag_set) {
1514 mp->mnt_flag = flag; /* restore mnt_flag value */
1515 }
1516 lck_rw_done(&mp->mnt_rwlock);
1517 }
1518
1519 if (did_set_lmount) {
1520 mount_lock_spin(mp);
1521 mp->mnt_lflag &= ~MNT_LMOUNT;
1522 mount_unlock(mp);
1523 }
1524
1525 if (mntalloc) {
1526 if (mp->mnt_crossref) {
1527 mount_dropcrossref(mp, vp, 0);
1528 } else {
1529 mount_lock_destroy(mp);
1530 #if CONFIG_MACF
1531 mac_mount_label_destroy(mp);
1532 #endif
1533 zfree(mount_zone, mp);
1534 }
1535 }
1536 if (vfsp_ref) {
1537 mount_list_lock();
1538 vfsp->vfc_refcount--;
1539 mount_list_unlock();
1540 }
1541
1542 return error;
1543 }
1544
1545 /*
1546 * Flush in-core data, check for competing mount attempts,
1547 * and set VMOUNT
1548 */
1549 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1550 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1551 {
1552 #if !CONFIG_MACF
1553 #pragma unused(cnp,fsname)
1554 #endif
1555 struct vnode_attr va;
1556 int error;
1557 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1558 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1559 boolean_t is_busy;
1560
1561 if (!skip_auth) {
1562 /*
1563 * If the user is not root, ensure that they own the directory
1564 * onto which we are attempting to mount.
1565 */
1566 VATTR_INIT(&va);
1567 VATTR_WANTED(&va, va_uid);
1568 if ((error = vnode_getattr(vp, &va, ctx)) ||
1569 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1570 (!vfs_context_issuser(ctx)))) {
1571 error = EPERM;
1572 goto out;
1573 }
1574 }
1575
1576 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1577 goto out;
1578 }
1579
1580 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1581 goto out;
1582 }
1583
1584 if (vp->v_type != VDIR) {
1585 error = ENOTDIR;
1586 goto out;
1587 }
1588
1589 vnode_lock_spin(vp);
1590 is_busy = is_fmount ?
1591 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1592 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1593 if (is_busy) {
1594 vnode_unlock(vp);
1595 error = EBUSY;
1596 goto out;
1597 }
1598 SET(vp->v_flag, VMOUNT);
1599 vnode_unlock(vp);
1600
1601 #if CONFIG_MACF
1602 error = mac_mount_check_mount(ctx, vp,
1603 cnp, fsname);
1604 if (error != 0) {
1605 vnode_lock_spin(vp);
1606 CLR(vp->v_flag, VMOUNT);
1607 vnode_unlock(vp);
1608 }
1609 #endif
1610
1611 out:
1612 return error;
1613 }
1614
1615 #if CONFIG_IMGSRC_ACCESS
1616
1617 #define DEBUG_IMGSRC 0
1618
1619 #if DEBUG_IMGSRC
1620 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1621 #else
1622 #define IMGSRC_DEBUG(args...) do { } while(0)
1623 #endif
1624
1625 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1626 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1627 {
1628 struct nameidata nd;
1629 vnode_t vp, realdevvp;
1630 mode_t accessmode;
1631 int error;
1632 enum uio_seg uio = UIO_USERSPACE;
1633
1634 if (ctx == vfs_context_kernel()) {
1635 uio = UIO_SYSSPACE;
1636 }
1637
1638 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1639 if ((error = namei(&nd))) {
1640 IMGSRC_DEBUG("namei() failed with %d\n", error);
1641 return error;
1642 }
1643
1644 vp = nd.ni_vp;
1645
1646 if (!vnode_isblk(vp)) {
1647 IMGSRC_DEBUG("Not block device.\n");
1648 error = ENOTBLK;
1649 goto out;
1650 }
1651
1652 realdevvp = mp->mnt_devvp;
1653 if (realdevvp == NULLVP) {
1654 IMGSRC_DEBUG("No device backs the mount.\n");
1655 error = ENXIO;
1656 goto out;
1657 }
1658
1659 error = vnode_getwithref(realdevvp);
1660 if (error != 0) {
1661 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1662 goto out;
1663 }
1664
1665 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1666 IMGSRC_DEBUG("Wrong dev_t.\n");
1667 error = ENXIO;
1668 goto out1;
1669 }
1670
1671 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1672
1673 /*
1674 * If mount by non-root, then verify that user has necessary
1675 * permissions on the device.
1676 */
1677 if (!vfs_context_issuser(ctx)) {
1678 accessmode = KAUTH_VNODE_READ_DATA;
1679 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1680 accessmode |= KAUTH_VNODE_WRITE_DATA;
1681 }
1682 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1683 IMGSRC_DEBUG("Access denied.\n");
1684 goto out1;
1685 }
1686 }
1687
1688 *devvpp = vp;
1689
1690 out1:
1691 vnode_put(realdevvp);
1692
1693 out:
1694 nameidone(&nd);
1695
1696 if (error) {
1697 vnode_put(vp);
1698 }
1699
1700 return error;
1701 }
1702
1703 /*
1704 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1705 * and call checkdirs()
1706 */
/*
 * Clear VMOUNT, set v_mountedhere and mnt_vnodecovered, take a usecount on
 * the covered vnode, bump the name cache generation, and run checkdirs()
 * so processes whose cwd/root sat on `vp' are retargeted onto the mount.
 *
 * On error, mnt_vnodecovered is reset to NULLVP and any usecount taken here
 * is dropped; note the caller is still responsible for v_mountedhere cleanup
 * (see undo_place_on_covered_vp()).
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the in-progress marker (VMOUNT) for the real covered state. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * ensure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the mount's lifetime. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1752
1753 static void
undo_place_on_covered_vp(mount_t mp,vnode_t vp)1754 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1755 {
1756 vnode_rele(vp);
1757 vnode_lock_spin(vp);
1758 vp->v_mountedhere = (mount_t)NULL;
1759 vnode_unlock(vp);
1760
1761 mp->mnt_vnodecovered = NULLVP;
1762 }
1763
1764 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)1765 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1766 {
1767 int error;
1768
1769 /* unmount in progress return error */
1770 mount_lock_spin(mp);
1771 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1772 mount_unlock(mp);
1773 return EBUSY;
1774 }
1775 mount_unlock(mp);
1776 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1777
1778 /*
1779 * We only allow the filesystem to be reloaded if it
1780 * is currently mounted read-only.
1781 */
1782 if ((flags & MNT_RELOAD) &&
1783 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1784 error = ENOTSUP;
1785 goto out;
1786 }
1787
1788 /*
1789 * Only root, or the user that did the original mount is
1790 * permitted to update it.
1791 */
1792 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1793 (!vfs_context_issuser(ctx))) {
1794 error = EPERM;
1795 goto out;
1796 }
1797 #if CONFIG_MACF
1798 error = mac_mount_check_remount(ctx, mp);
1799 if (error != 0) {
1800 goto out;
1801 }
1802 #endif
1803
1804 out:
1805 if (error) {
1806 lck_rw_done(&mp->mnt_rwlock);
1807 }
1808
1809 return error;
1810 }
1811
/*
 * End an update operation begun by mount_begin_update(): drop the
 * exclusive mount rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1817
1818 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)1819 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1820 {
1821 vnode_t vp;
1822
1823 if (height >= MAX_IMAGEBOOT_NESTING) {
1824 return EINVAL;
1825 }
1826
1827 vp = imgsrc_rootvnodes[height];
1828 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1829 *rvpp = vp;
1830 return 0;
1831 } else {
1832 return ENOENT;
1833 }
1834 }
1835
/*
 * Relocate the imageboot source filesystem: move the mount that currently
 * serves as an imageboot root (at nesting level `height') so that it covers
 * `vp' instead, updating f_mntonname and adding it to the mount list.
 * Each imageboot mount may be moved at most once (MNTK_HAS_MOVED).
 *
 * fsmountargs is a user pointer: with `by_index' it is a
 * user{32,64}_mnt_imgsrc_args structure (height/flags/devpath); otherwise,
 * for binary compatibility, it is just a device path and height 0 is assumed.
 *
 * Requires root.  Returns 0 on success, errno on failure; all intermediate
 * state (VMOUNT, covered-vp placement, f_mntonname, MNTK_HAS_MOVED) is
 * unwound on the error paths.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Modern path: copy in the full argument structure. */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flags are currently defined; reject anything nonzero. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; dropped at out0/exit. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		/*
		 * NOTE(review): `error' is 0 here, so this path reports
		 * success, whereas the identical pre-lock check above
		 * returns EBUSY -- confirm this asymmetry is intentional.
		 */
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the authorization; drop the iocount. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Preserve the old mount-on name in case list insertion fails below. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Restore the saved mount-on name and allow a future move attempt. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2056
2057 #endif /* CONFIG_IMGSRC_ACCESS */
2058
/*
 * Turn on disk quotas for a freshly mounted (or updated) filesystem when
 * the per-type quota trigger files exist at the mount's root.  Errors are
 * deliberately ignored: quota setup must never make a final mount fail.
 * Currently limited to HFS (matched by fstypename).
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Look for the quota-ops trigger file for this quota type. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		/* Only needed to check existence; drop the iocount namei took. */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Build the actual quota-file path and enable quotas on it. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2092
2093
/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is `olddp' (the vnode just covered by a new mount), retarget
 * it at `newdp' (the new mount's root), keeping usecounts balanced so each
 * fd_cdir/fd_rdir slot always owns a reference on its vnode.
 * Always returns PROC_RETURNED so the process iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* refs taken below that end up not installed are dropped at the end */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* second ref failed: drop the first and bail */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2173
2174
2175
2176 /*
2177 * Scan all active processes to see if any of them have a current
2178 * or root directory onto which the new filesystem has just been
2179 * mounted. If so, replace them with the new mount point.
2180 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Single usecount — presumably only the mount's own reference, so
	 * no process can have olddp as cwd/root; nothing to do.  (NOTE:
	 * read without the vnode lock — pre-existing behavior.) */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root vnode of the filesystem now covering olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was just covered, swap the global
	 * rootvnode under the rw lock, then drop the old root's ref. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2218
2219 /*
2220 * Unmount a file system.
2221 *
2222 * Note: unmount takes a path to the vnode mounted on as argument,
2223 * not special file (as before).
2224 */
2225 /* ARGSUSED */
2226 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2227 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2228 {
2229 vnode_t vp;
2230 struct mount *mp;
2231 int error;
2232 struct nameidata nd;
2233 vfs_context_t ctx = vfs_context_current();
2234
2235 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2236 UIO_USERSPACE, uap->path, ctx);
2237 error = namei(&nd);
2238 if (error) {
2239 return error;
2240 }
2241 vp = nd.ni_vp;
2242 mp = vp->v_mount;
2243 nameidone(&nd);
2244
2245 #if CONFIG_MACF
2246 error = mac_mount_check_umount(ctx, mp);
2247 if (error != 0) {
2248 vnode_put(vp);
2249 return error;
2250 }
2251 #endif
2252 /*
2253 * Must be the root of the filesystem
2254 */
2255 if ((vp->v_flag & VROOT) == 0) {
2256 vnode_put(vp);
2257 return EINVAL;
2258 }
2259 mount_ref(mp, 0);
2260 vnode_put(vp);
2261 /* safedounmount consumes the mount ref */
2262 return safedounmount(mp, uap->flags, ctx);
2263 }
2264
2265 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2266 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2267 {
2268 mount_t mp;
2269
2270 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2271 if (mp == (mount_t)0) {
2272 return ENOENT;
2273 }
2274 mount_ref(mp, 0);
2275 mount_iterdrop(mp);
2276 /* safedounmount consumes the mount ref */
2277 return safedounmount(mp, flags, ctx);
2278 }
2279
2280 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2281 "com.apple.private.vfs.role-account-unmount"
2282
2283 /*
2284 * The mount struct comes with a mount ref which will be consumed.
2285 * Do the actual file system unmount, prevent some common foot shooting.
2286 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes to unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts).
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* All checks passed: dounmount() takes over the mount ref (withref=1). */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Error paths must still consume the caller's mount ref. */
	mount_drop(mp, 0);
	return error;
}
2350
2351 /*
2352 * Do the actual file system unmount.
2353 */
/*
 * dounmount: core unmount engine.  "mp" arrives with a mount ref held by
 * the caller when "withref" is non-zero; that ref is dropped here on every
 * path.  On failure the MNTK_UNMOUNT / MNT_LUNMOUNT / MNT_LFORCE state set
 * below is rolled back before returning the error.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep a NOBLOCK caller from hanging on unresponsive remote FSes. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* flush dirty data before asking the FS to unmount */
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* vnodes still busy and not forcing: roll back unmount state */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	/* hand off to the filesystem; past this point failure rolls back */
	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* NOTE(review): the rwlock is dropped around mount_list_remove() —
	 * presumably for lock ordering with the mount list lock; confirm. */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* mount lock is held here on both success and error paths */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* notify watchers of the parent directory */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2638
2639 /*
2640 * Unmount any mounts in this filesystem.
2641 */
/*
 * dounmount_submounts: unmount every mount nested under "mp", deepest
 * first.  Errors are ignored.  Callers pass MNT_LNOSUB in "flags" so the
 * dounmount() calls below do not recurse back in here.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* covered vnode lives on a doomed mount:
				 * this mount is a (transitive) submount */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	/* index 0 is mp itself, so stop at i > 0 */
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	/* kfree_data() tolerates a NULL pointer on the alloc-failure path */
	kfree_data(fsids, fsids_sz);
}
2699
2700 void
mount_dropcrossref(mount_t mp,vnode_t dp,int need_put)2701 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2702 {
2703 vnode_lock(dp);
2704 mp->mnt_crossref--;
2705
2706 if (mp->mnt_crossref < 0) {
2707 panic("mount cross refs -ve");
2708 }
2709
2710 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2711 if (need_put) {
2712 vnode_put_locked(dp);
2713 }
2714 vnode_unlock(dp);
2715
2716 mount_lock_destroy(mp);
2717 #if CONFIG_MACF
2718 mac_mount_label_destroy(mp);
2719 #endif
2720 zfree(mount_zone, mp);
2721 return;
2722 }
2723 if (need_put) {
2724 vnode_put_locked(dp);
2725 }
2726 vnode_unlock(dp);
2727 }
2728
2729
2730 /*
2731 * Sync each mounted filesystem.
2732 */
2733 #if DIAGNOSTIC
2734 int syncprt = 0;
2735 #endif
2736
2737 int print_vmpage_stat = 0;
2738
2739 /*
2740 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2741 * mounted read-write with the passed waitfor value.
2742 *
2743 * Parameters: mp mount-point descriptor per mounted file-system instance.
2744 * arg user argument (please see below)
2745 *
2746 * User argument is a pointer to 32 bit unsigned integer which describes the
2747 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2748 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2749 * waitfor value.
2750 *
2751 * Returns: VFS_RETURNED
2752 */
2753 static int
sync_callback(mount_t mp,void * arg)2754 sync_callback(mount_t mp, void *arg)
2755 {
2756 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2757 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2758 unsigned waitfor = MNT_NOWAIT;
2759
2760 if (arg) {
2761 waitfor = *(uint32_t*)arg;
2762 }
2763
2764 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2765 if (waitfor != MNT_WAIT &&
2766 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2767 waitfor != MNT_NOWAIT &&
2768 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2769 waitfor != MNT_DWAIT &&
2770 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2771 panic("Passed inappropriate waitfor %u to "
2772 "sync_callback()", waitfor);
2773 }
2774
2775 mp->mnt_flag &= ~MNT_ASYNC;
2776 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2777 if (asyncflag) {
2778 mp->mnt_flag |= MNT_ASYNC;
2779 }
2780 }
2781
2782 return VFS_RETURNED;
2783 }
2784
2785 /* ARGSUSED */
2786 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)2787 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2788 {
2789 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2790
2791 if (print_vmpage_stat) {
2792 vm_countdirtypages();
2793 }
2794
2795 #if DIAGNOSTIC
2796 if (syncprt) {
2797 vfs_bufstats();
2798 }
2799 #endif /* DIAGNOSTIC */
2800 return 0;
2801 }
2802
2803 typedef enum {
2804 SYNC_ALL = 0,
2805 SYNC_ONLY_RELIABLE_MEDIA = 1,
2806 SYNC_ONLY_UNRELIABLE_MEDIA = 2
2807 } sync_type_t;
2808
2809 static int
sync_internal_callback(mount_t mp,void * arg)2810 sync_internal_callback(mount_t mp, void *arg)
2811 {
2812 if (arg) {
2813 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2814 (mp->mnt_flag & MNT_LOCAL);
2815 sync_type_t sync_type = *((sync_type_t *)arg);
2816
2817 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2818 return VFS_RETURNED;
2819 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2820 return VFS_RETURNED;
2821 }
2822 }
2823
2824 (void)sync_callback(mp, NULL);
2825
2826 return VFS_RETURNED;
2827 }
2828
2829 int sync_thread_state = 0;
2830 int sync_timeout_seconds = 5;
2831
2832 #define SYNC_THREAD_RUN 0x0001
2833 #define SYNC_THREAD_RUNNING 0x0002
2834
2835 #if CONFIG_PHYS_WRITE_ACCT
2836 thread_t pm_sync_thread;
2837 #endif /* CONFIG_PHYS_WRITE_ACCT */
2838
/*
 * sync_thread: worker started by sync_internal().  Loops while
 * SYNC_THREAD_RUN is set, syncing reliable media first, then unreliable.
 * Wakes any waiter in sync_internal() before exiting.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* consume the RUN request; drop the lock while iterating */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2882
2883 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2884
2885 /*
2886 * An in-kernel sync for power management to call.
2887 * This function always returns within sync_timeout seconds.
2888 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	/* bound the wait so this function returns within sync_timeout_seconds */
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* no worker alive: spawn one to service the RUN request */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP: msleep releases sync_mtx_lck on return */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* rate-limit the timeout message to once per 120 seconds */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2931
2932 /*
2933 * Change filesystem quotas.
2934 */
2935 #if QUOTA
/*
 * quotactl: manipulate filesystem quotas on the mount containing "path".
 * Copies in per-subcommand arguments, dispatches to VFS_QUOTACTL(), then
 * copies results back out for the query subcommands.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* hold the mount, not the vnode, for the duration of the call */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* per-subcommand cleanup / copyout of results */
	switch (quota_cmd) {
	case Q_QUOTAON:
		/* the pathname buffer is freed even if copyinstr failed above */
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3042 #else
3043 int
quotactl(__unused proc_t p,__unused struct quotactl_args * uap,__unused int32_t * retval)3044 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
3045 {
3046 return EOPNOTSUPP;
3047 }
3048 #endif /* QUOTA */
3049
3050 /*
3051 * Get filesystem statistics.
3052 *
3053 * Returns: 0 Success
3054 * namei:???
3055 * vfs_update_vfsstat:???
3056 * munge_statfs:EFAULT
3057 */
3058 /* ARGSUSED */
3059 int
statfs(__unused proc_t p,struct statfs_args * uap,__unused int32_t * retval)3060 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3061 {
3062 struct mount *mp;
3063 struct vfsstatfs *sp;
3064 int error;
3065 struct nameidata nd;
3066 vfs_context_t ctx = vfs_context_current();
3067 vnode_t vp;
3068
3069 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3070 UIO_USERSPACE, uap->path, ctx);
3071 error = namei(&nd);
3072 if (error != 0) {
3073 return error;
3074 }
3075 vp = nd.ni_vp;
3076 mp = vp->v_mount;
3077 sp = &mp->mnt_vfsstat;
3078 nameidone(&nd);
3079
3080 #if CONFIG_MACF
3081 error = mac_mount_check_stat(ctx, mp);
3082 if (error != 0) {
3083 vnode_put(vp);
3084 return error;
3085 }
3086 #endif
3087
3088 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3089 if (error != 0) {
3090 vnode_put(vp);
3091 return error;
3092 }
3093
3094 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3095 vnode_put(vp);
3096 return error;
3097 }
3098
3099 /*
3100 * Get filesystem statistics.
3101 */
3102 /* ARGSUSED */
3103 int
fstatfs(__unused proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3104 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3105 {
3106 vnode_t vp;
3107 struct mount *mp;
3108 struct vfsstatfs *sp;
3109 int error;
3110
3111 AUDIT_ARG(fd, uap->fd);
3112
3113 if ((error = file_vnode(uap->fd, &vp))) {
3114 return error;
3115 }
3116
3117 error = vnode_getwithref(vp);
3118 if (error) {
3119 file_drop(uap->fd);
3120 return error;
3121 }
3122
3123 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3124
3125 mp = vp->v_mount;
3126 if (!mp) {
3127 error = EBADF;
3128 goto out;
3129 }
3130
3131 #if CONFIG_MACF
3132 error = mac_mount_check_stat(vfs_context_current(), mp);
3133 if (error != 0) {
3134 goto out;
3135 }
3136 #endif
3137
3138 sp = &mp->mnt_vfsstat;
3139 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3140 goto out;
3141 }
3142
3143 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3144
3145 out:
3146 file_drop(uap->fd);
3147 vnode_put(vp);
3148
3149 return error;
3150 }
3151
3152 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3153 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3154 {
3155 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3156
3157 bzero(sfs, sizeof(*sfs));
3158
3159 sfs->f_bsize = vsfs->f_bsize;
3160 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3161 sfs->f_blocks = vsfs->f_blocks;
3162 sfs->f_bfree = vsfs->f_bfree;
3163 sfs->f_bavail = vsfs->f_bavail;
3164 sfs->f_files = vsfs->f_files;
3165 sfs->f_ffree = vsfs->f_ffree;
3166 sfs->f_fsid = vsfs->f_fsid;
3167 sfs->f_owner = vsfs->f_owner;
3168 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3169 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3170 sfs->f_fssubtype = vsfs->f_fssubtype;
3171 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3172 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3173 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3174 } else {
3175 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3176 }
3177 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3178 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3179 }
3180
3181 /*
3182 * Get file system statistics in 64-bit mode
3183 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/* nameidata + statfs64 are large; heap-allocated as one aggregate
	 * (presumably to keep them off the kernel stack — NOTE(review)). */
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* refresh cached statistics before reporting them */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3241
3242 /*
3243 * Get file system statistics in 64-bit mode
3244 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* refresh cached statistics before reporting them */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	/* both the fd reference and the vnode iocount are dropped here */
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3298
/*
 * State shared between getfsstat()/__mac_getfsstat()/getfsstat64() and
 * their vfs_iterate() callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;	/* user buffer cursor; advanced after each entry copied out */
	user_addr_t *mp;	/* optional array of user MAC label pointers, or NULL */
	int count;		/* number of mounts visited so far */
	int maxcount;		/* capacity of the user buffer, in entries */
	int flags;		/* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT flags */
	int error;		/* first error encountered by the callback, if any */
};
3307
3308
3309 static int
getfsstat_callback(mount_t mp,void * arg)3310 getfsstat_callback(mount_t mp, void * arg)
3311 {
3312 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3313 struct vfsstatfs *sp;
3314 int error, my_size;
3315 vfs_context_t ctx = vfs_context_current();
3316
3317 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3318 #if CONFIG_MACF
3319 error = mac_mount_check_stat(ctx, mp);
3320 if (error != 0) {
3321 fstp->error = error;
3322 return VFS_RETURNED_DONE;
3323 }
3324 #endif
3325 sp = &mp->mnt_vfsstat;
3326 /*
3327 * If MNT_NOWAIT is specified, do not refresh the
3328 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3329 */
3330 if ((mp->mnt_lflag & MNT_LDEAD) ||
3331 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3332 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3333 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3334 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3335 return VFS_RETURNED;
3336 }
3337
3338 /*
3339 * Need to handle LP64 version of struct statfs
3340 */
3341 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3342 if (error) {
3343 fstp->error = error;
3344 return VFS_RETURNED_DONE;
3345 }
3346 fstp->sfsp += my_size;
3347
3348 if (fstp->mp) {
3349 #if CONFIG_MACF
3350 error = mac_mount_label_get(mp, *fstp->mp);
3351 if (error) {
3352 fstp->error = error;
3353 return VFS_RETURNED_DONE;
3354 }
3355 #endif
3356 fstp->mp++;
3357 }
3358 }
3359 fstp->count++;
3360 return VFS_RETURNED;
3361 }
3362
3363 /*
3364 * Get statistics on all filesystems.
3365 */
3366 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3367 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3368 {
3369 struct __mac_getfsstat_args muap;
3370
3371 muap.buf = uap->buf;
3372 muap.bufsize = uap->bufsize;
3373 muap.mac = USER_ADDR_NULL;
3374 muap.macsize = 0;
3375 muap.flags = uap->flags;
3376
3377 return __mac_getfsstat(p, &muap, retval);
3378 }
3379
3380 /*
3381 * __mac_getfsstat: Get MAC-related file system statistics
3382 *
3383 * Parameters: p (ignored)
3384 * uap User argument descriptor (see below)
3385 * retval Count of file system statistics (N stats)
3386 *
3387 * Indirect: uap->bufsize Buffer size
3388 * uap->macsize MAC info size
3389 * uap->buf Buffer where information will be returned
3390 * uap->mac MAC info
3391 * uap->flags File system flags
3392 *
3393 *
3394 * Returns: 0 Success
3395 * !0 Not success
3396 *
3397 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry size in the user buffer depends on the caller's ABI. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* Caller must supply exactly one label pointer per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit user pointers; copy 64-bit ones as-is. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including those in the middle of unmounting. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report at most maxcount entries even if more mounts exist. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3491
3492 static int
getfsstat64_callback(mount_t mp,void * arg)3493 getfsstat64_callback(mount_t mp, void * arg)
3494 {
3495 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3496 struct vfsstatfs *sp;
3497 struct statfs64 sfs;
3498 int error;
3499
3500 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3501 #if CONFIG_MACF
3502 error = mac_mount_check_stat(vfs_context_current(), mp);
3503 if (error != 0) {
3504 fstp->error = error;
3505 return VFS_RETURNED_DONE;
3506 }
3507 #endif
3508 sp = &mp->mnt_vfsstat;
3509 /*
3510 * If MNT_NOWAIT is specified, do not refresh the fsstat
3511 * cache. MNT_WAIT overrides MNT_NOWAIT.
3512 *
3513 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3514 * getfsstat, since the constants are out of the same
3515 * namespace.
3516 */
3517 if ((mp->mnt_lflag & MNT_LDEAD) ||
3518 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3519 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3520 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3521 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3522 return VFS_RETURNED;
3523 }
3524
3525 vfs_get_statfs64(mp, &sfs);
3526 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3527 if (error) {
3528 fstp->error = error;
3529 return VFS_RETURNED_DONE;
3530 }
3531 fstp->sfsp += sizeof(sfs);
3532 }
3533 fstp->count++;
3534 return VFS_RETURNED;
3535 }
3536
3537 /*
3538 * Get statistics on all file systems in 64 bit mode.
3539 */
3540 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3541 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3542 {
3543 user_addr_t sfsp;
3544 int count, maxcount;
3545 struct getfsstat_struct fst;
3546
3547 maxcount = uap->bufsize / sizeof(struct statfs64);
3548
3549 sfsp = uap->buf;
3550 count = 0;
3551
3552 fst.sfsp = sfsp;
3553 fst.flags = uap->flags;
3554 fst.count = 0;
3555 fst.error = 0;
3556 fst.maxcount = maxcount;
3557
3558 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3559
3560 if (fst.error) {
3561 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3562 return fst.error;
3563 }
3564
3565 if (fst.sfsp && fst.count > fst.maxcount) {
3566 *retval = fst.maxcount;
3567 } else {
3568 *retval = fst.count;
3569 }
3570
3571 return 0;
3572 }
3573
3574 /*
 * Gets the vnode associated with the file descriptor passed
 * as input.
3577 *
3578 * INPUT
3579 * ctx - vfs context of caller
3580 * fd - file descriptor for which vnode is required.
3581 * vpp - Pointer to pointer to vnode to be returned.
3582 *
3583 * The vnode is returned with an iocount so any vnode obtained
3584 * by this call needs a vnode_put
3585 *
3586 */
3587 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3588 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3589 {
3590 int error;
3591 vnode_t vp;
3592 struct fileproc *fp;
3593 proc_t p = vfs_context_proc(ctx);
3594
3595 *vpp = NULLVP;
3596
3597 error = fp_getfvp(p, fd, &fp, &vp);
3598 if (error) {
3599 return error;
3600 }
3601
3602 error = vnode_getwithref(vp);
3603 if (error) {
3604 (void)fp_drop(p, fd, fp, 0);
3605 return error;
3606 }
3607
3608 (void)fp_drop(p, fd, fp, 0);
3609 *vpp = vp;
3610 return error;
3611 }
3612
3613 /*
3614 * Wrapper function around namei to start lookup from a directory
3615 * specified by a file descriptor ni_dirfd.
3616 *
3617 * In addition to all the errors returned by namei, this call can
3618 * return ENOTDIR if the file descriptor does not refer to a directory.
3619 * and EBADF if the file descriptor is not valid.
3620 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only divert through dirfd when the lookup actually starts there:
	 * not AT_FDCWD, not a continued lookup, and no caller-supplied
	 * starting directory vnode.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Returns with an iocount on dvp_at on success. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Start the lookup at dirfd's vnode (USEDVP), then
			 * clear the flag so the nameidata can be reused.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain namei() lookup. */
	return namei(ndp);
}
3664
3665 /*
3666 * Change current working directory to a given file descriptor.
3667 */
3668 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If a filesystem is mounted on this directory, descend to the root
	 * of the covering mount; repeat for stacked mounts.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the transient iocount for a long-lived usecount. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		/* Per-thread cwd: stash the vnode on the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Per-process cwd: swap fd_cdir under the fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
3781
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Change the per-process working directory to the fd's directory. */
	return common_fchdir(p, uap, 0);
}
3787
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread variant. The cast assumes __pthread_fchdir_args is
	 * layout-compatible with fchdir_args (TODO confirm against the
	 * generated syscall args structs). fd == -1 reverts the thread to
	 * the per-process cwd, handled inside common_fchdir().
	 */
	return common_fchdir(p, (void *)uap, 1);
}
3793
3794
3795 /*
3796 * Change current working directory (".").
3797 *
3798 * Returns: 0 Success
3799 * change_dir:ENOTDIR
3800 * change_dir:???
3801 * vnode_ref:ENOENT No such file or directory
3802 */
3803 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve the path; returns with an iocount on ndp->ni_vp on success. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-lived usecount before dropping the transient iocount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Per-thread cwd: stash the vnode on the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Per-process cwd: swap fd_cdir under the fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3849
3850
3851 /*
3852 * Change current working directory (".").
3853 *
3854 * Returns: 0 Success
3855 * chdir_internal:ENOTDIR
3856 * chdir_internal:ENOENT No such file or directory
3857 * chdir_internal:???
3858 */
3859 /* ARGSUSED */
3860 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)3861 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3862 {
3863 struct nameidata nd;
3864 vfs_context_t ctx = vfs_context_current();
3865
3866 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3867 UIO_USERSPACE, uap->path, ctx);
3868
3869 return chdir_internal(p, ctx, &nd, per_thread);
3870 }
3871
3872
3873 /*
3874 * chdir
3875 *
3876 * Change current working directory (".") for the entire process
3877 *
3878 * Parameters: p Process requesting the call
3879 * uap User argument descriptor (see below)
3880 * retval (ignored)
3881 *
3882 * Indirect parameters: uap->path Directory path
3883 *
3884 * Returns: 0 Success
3885 * common_chdir: ENOTDIR
3886 * common_chdir: ENOENT No such file or directory
3887 * common_chdir: ???
3888 *
3889 */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Per-process chdir; chdir_args is cast to the common args layout. */
	return common_chdir(p, (void *)uap, 0);
}
3895
3896 /*
3897 * __pthread_chdir
3898 *
3899 * Change current working directory (".") for a single thread
3900 *
3901 * Parameters: p Process requesting the call
3902 * uap User argument descriptor (see below)
3903 * retval (ignored)
3904 *
3905 * Indirect parameters: uap->path Directory path
3906 *
3907 * Returns: 0 Success
3908 * common_chdir: ENOTDIR
3909 * common_chdir: ENOENT No such file or directory
3910 * common_chdir: ???
3911 *
3912 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir; args struct is cast to the common layout. */
	return common_chdir(p, (void *)uap, 1);
}
3918
3919
3920 /*
3921 * Change notion of root (``/'') directory.
3922 */
3923 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Returns with an iocount on nd.ni_vp on success. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the transient iocount for a long-lived usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the previous root directory, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3981
3982 #define PATHSTATICBUFLEN 256
3983 #define PIVOT_ROOT_ENTITLEMENT \
3984 "com.apple.private.vfs.pivot-root"
3985
3986 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack path buffers; fall back to ZV_NAMEI zone buffers
	 * when a path exceeds PATHSTATICBUFLEN. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new root's current path, growing the buffer if needed. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Copy in the path where the old root will appear after the switch. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point at whichever buffer (static or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4078 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS; reject as unimplemented. */
	return nosys(p, NULL, retval);
}
4084 #endif /* XNU_TARGET_OS_OSX */
4085
4086 /*
4087 * Common routine for chroot and chdir.
4088 *
4089 * Returns: 0 Success
4090 * ENOTDIR Not a directory
4091 * namei:??? [anything namei can return]
4092 * vnode_authorize:??? [anything vnode_authorize can return]
4093 */
4094 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4095 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4096 {
4097 vnode_t vp;
4098 int error;
4099
4100 if ((error = namei(ndp))) {
4101 return error;
4102 }
4103 nameidone(ndp);
4104 vp = ndp->ni_vp;
4105
4106 if (vp->v_type != VDIR) {
4107 vnode_put(vp);
4108 return ENOTDIR;
4109 }
4110
4111 #if CONFIG_MACF
4112 error = mac_vnode_check_chdir(ctx, vp);
4113 if (error) {
4114 vnode_put(vp);
4115 return error;
4116 }
4117 #endif
4118
4119 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4120 if (error) {
4121 vnode_put(vp);
4122 return error;
4123 }
4124
4125 return error;
4126 }
4127
4128 /*
4129 * Free the vnode data (for directories) associated with the file glob.
4130 */
4131 struct fd_vn_data *
fg_vn_data_alloc(void)4132 fg_vn_data_alloc(void)
4133 {
4134 struct fd_vn_data *fvdata;
4135
4136 /* Allocate per fd vnode data */
4137 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4138 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4139 return fvdata;
4140 }
4141
4142 /*
4143 * Free the vnode data (for directories) associated with the file glob.
4144 */
4145 void
fg_vn_data_free(void * fgvndata)4146 fg_vn_data_free(void *fgvndata)
4147 {
4148 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4149
4150 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4151 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4152 kfree_type(struct fd_vn_data, fvdata);
4153 }
4154
4155 /*
4156 * Check permissions, allocate an open file structure,
4157 * and call the device open routine if any.
4158 *
4159 * Returns: 0 Success
4160 * EINVAL
4161 * EINTR
4162 * falloc:ENFILE
4163 * falloc:EMFILE
4164 * falloc:ENOMEM
4165 * vn_open_auth:???
4166 * dupfdopen:???
4167 * VNOP_ADVLOCK:???
4168 * vnode_setsize:???
4169 *
4170 * XXX Need to implement uid, gid
4171 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct vfs_context context;

	oflags = uflags;

	/* O_RDWR and O_WRONLY together form an invalid access mode. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/* These bits are set internally by vn_open_auth; never trust them
	 * coming in from userspace. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot and fileproc up front. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		/*
		 * /dev/fd opens report ENODEV/ENXIO with uu_dupfd set;
		 * duplicate the referenced descriptor instead of failing.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

	/* Honor open-time advisory locking requests (O_EXLOCK/O_SHLOCK). */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember to unlock on the error path below. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	/*
	 * Drop our transient iocount. NOTE(review): vp is still referenced
	 * below (vnode_istty, ubc_getobject); presumably the fileglob's
	 * reference from fp_set_data() keeps it valid — confirm.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	/* Publish the descriptor in the process fd table. */
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Policy: decide whether this file's pages may use secluded memory. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error path: undo any advisory lock, close the vnode, free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4380
4381 /*
4382 * While most of the *at syscall handlers can call nameiat() which
4383 * is a wrapper around namei, the use of namei and initialisation
4384 * of nameidata are far removed and in different functions - namei
4385 * gets called in vn_open_auth for open1. So we'll just do here what
4386 * nameiat() does.
4387 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd)
{
	/* Mirror nameiat(): only divert through dirfd for relative paths. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Returns with an iocount on dvp_at on success. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Start the lookup at dirfd's directory (USEDVP).
			 * NOTE(review): unlike nameiat(), USEDVP is not
			 * cleared afterwards — presumably the lookup inside
			 * vn_open_auth consumes it; confirm.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval);
}
4431
4432 /*
4433 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4434 *
4435 * Parameters: p Process requesting the open
4436 * uap User argument descriptor (see below)
 *		retval			Pointer to an area to receive the
 *					return value from the system call
4439 *
4440 * Indirect: uap->path Path to open (same as 'open')
 *		uap->flags		Flags to open (same as 'open')
4442 * uap->uid UID to set, if creating
4443 * uap->gid GID to set, if creating
4444 * uap->mode File mode, if creating (same as 'open')
4445 * uap->xsecurity ACL to set, if creating
4446 *
4447 * Returns: 0 Success
4448 * !0 errno value
4449 *
4450 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4451 *
4452 * XXX: We should enummerate the possible errno values here, and where
4453 * in the code they originated.
4454 */
4455 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4456 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4457 {
4458 int ciferror;
4459 kauth_filesec_t xsecdst;
4460 struct vnode_attr va;
4461 struct nameidata nd;
4462 int cmode;
4463
4464 AUDIT_ARG(owner, uap->uid, uap->gid);
4465
4466 xsecdst = NULL;
4467 if ((uap->xsecurity != USER_ADDR_NULL) &&
4468 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4469 return ciferror;
4470 }
4471
4472 VATTR_INIT(&va);
4473 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4474 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4475 if (uap->uid != KAUTH_UID_NONE) {
4476 VATTR_SET(&va, va_uid, uap->uid);
4477 }
4478 if (uap->gid != KAUTH_GID_NONE) {
4479 VATTR_SET(&va, va_gid, uap->gid);
4480 }
4481 if (xsecdst != NULL) {
4482 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4483 va.va_vaflags |= VA_FILESEC_ACL;
4484 }
4485
4486 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4487 uap->path, vfs_context_current());
4488
4489 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4490 NULL, NULL, retval);
4491 if (xsecdst != NULL) {
4492 kauth_filesec_free(xsecdst);
4493 }
4494
4495 return ciferror;
4496 }
4497
4498 /*
4499 * Go through the data-protected atomically controlled open (2)
4500 *
4501 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4502 */
4503 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4504 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4505 {
4506 int flags = uap->flags;
4507 int class = uap->class;
4508 int dpflags = uap->dpflags;
4509
4510 /*
4511 * Follow the same path as normal open(2)
4512 * Look up the item if it exists, and acquire the vnode.
4513 */
4514 struct vnode_attr va;
4515 struct nameidata nd;
4516 int cmode;
4517 int error;
4518
4519 VATTR_INIT(&va);
4520 /* Mask off all but regular access permissions */
4521 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4522 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4523
4524 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4525 uap->path, vfs_context_current());
4526
4527 /*
4528 * Initialize the extra fields in vnode_attr to pass down our
4529 * extra fields.
4530 * 1. target cprotect class.
4531 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4532 */
4533 if (flags & O_CREAT) {
4534 /* lower level kernel code validates that the class is valid before applying it. */
4535 if (class != PROTECTION_CLASS_DEFAULT) {
4536 /*
4537 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4538 * file behave the same as open (2)
4539 */
4540 VATTR_SET(&va, va_dataprotect_class, class);
4541 }
4542 }
4543
4544 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4545 if (flags & (O_RDWR | O_WRONLY)) {
4546 /* Not allowed to write raw encrypted bytes */
4547 return EINVAL;
4548 }
4549 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4550 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4551 }
4552 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4553 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4554 }
4555 }
4556
4557 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4558 NULL, NULL, retval);
4559
4560 return error;
4561 }
4562
4563 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)4564 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4565 int fd, enum uio_seg segflg, int *retval)
4566 {
4567 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4568 struct {
4569 struct vnode_attr va;
4570 struct nameidata nd;
4571 } *__open_data;
4572 struct vnode_attr *vap;
4573 struct nameidata *ndp;
4574 int cmode;
4575 int error;
4576
4577 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
4578 vap = &__open_data->va;
4579 ndp = &__open_data->nd;
4580
4581 VATTR_INIT(vap);
4582 /* Mask off all but regular access permissions */
4583 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4584 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
4585
4586 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4587 segflg, path, ctx);
4588
4589 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd);
4590
4591 kfree_type(typeof(*__open_data), __open_data);
4592
4593 return error;
4594 }
4595
/*
 * open(2): cancellation-point wrapper around open_nocancel().
 */
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	__pthread_testcancel(1);	/* this syscall is a pthread cancellation point */
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
4602
/*
 * open(2) without the cancellation check: forwards to the common openat
 * path with AT_FDCWD, so the path is resolved relative to the CWD.
 */
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
4610
/*
 * openat(2) without the cancellation check: same as open_nocancel() except
 * relative paths are resolved against the directory fd 'uap->fd'.
 */
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, uap->fd, UIO_USERSPACE, retval);
}
4618
/*
 * openat(2): cancellation-point wrapper around openat_nocancel().
 */
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	__pthread_testcancel(1);	/* this syscall is a pthread cancellation point */
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
4625
4626 /*
4627 * openbyid_np: open a file given a file system id and a file system object id
4628 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
4629 * file systems that don't support object ids it is a node id (uint64_t).
4630 *
4631 * Parameters: p Process requesting the open
4632 * uap User argument descriptor (see below)
4633 * retval Pointer to an area to receive the
4634 * return calue from the system call
4635 *
4636 * Indirect: uap->path Path to open (same as 'open')
4637 *
4638 * uap->fsid id of target file system
4639 * uap->objid id of target file system object
4640 * uap->flags Flags to open (same as 'open')
4641 *
4642 * Returns: 0 Success
4643 * !0 errno value
4644 *
4645 *
4646 * XXX: We should enummerate the possible errno values here, and where
4647 * in the code they originated.
4648 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Opening by file system object id is a privileged operation. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return error;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from (fsid, objid), growing the buffer by
	 * MAXPATHLEN each time fsgetpath_internal reports ENOSPC.
	 * NOTE(review): there is no visible upper bound on 'buflen' here --
	 * presumably fsgetpath_internal caps the acceptable length; confirm.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate, then open the resolved path from kernel space. */
	buf[pathlen] = 0;

	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
4705
4706
4707 /*
4708 * Create a special file.
4709 */
4710 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4711
/*
 * mknod(2): create a character or block device node (FIFOs are delegated
 * to mkfifo1()).  Requires superuser privileges.
 */
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	/* Creation mode honors the process umask; va_rdev carries the device. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, (mode_t)uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device nodes requires superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only char/block devices remain after the FIFO case above. */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4814
4815 /*
4816 * Create a named pipe.
4817 *
4818 * Returns: 0 Success
4819 * EEXIST
4820 * namei:???
4821 * vnode_authorize:???
4822 * vn_create:???
4823 */
/*
 * Common FIFO creation: look up the (not yet existing) target name,
 * authorize the addition, and create a VFIFO node with the attributes in
 * 'vap'.  'upath' is always a user-space address here (UIO_USERSPACE).
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4866
4867
4868 /*
4869 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4870 *
4871 * Parameters: p Process requesting the open
4872 * uap User argument descriptor (see below)
4873 * retval (Ignored)
4874 *
4875 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4876 * uap->uid UID to set
4877 * uap->gid GID to set
4878 * uap->mode File mode to set (same as 'mkfifo')
4879 * uap->xsecurity ACL to set, if creating
4880 *
4881 * Returns: 0 Success
4882 * !0 errno value
4883 *
4884 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4885 *
4886 * XXX: We should enummerate the possible errno values here, and where
4887 * in the code they originated.
4888 */
4889 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)4890 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4891 {
4892 int ciferror;
4893 kauth_filesec_t xsecdst;
4894 struct vnode_attr va;
4895
4896 AUDIT_ARG(owner, uap->uid, uap->gid);
4897
4898 xsecdst = KAUTH_FILESEC_NONE;
4899 if (uap->xsecurity != USER_ADDR_NULL) {
4900 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4901 return ciferror;
4902 }
4903 }
4904
4905 VATTR_INIT(&va);
4906 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4907 if (uap->uid != KAUTH_UID_NONE) {
4908 VATTR_SET(&va, va_uid, uap->uid);
4909 }
4910 if (uap->gid != KAUTH_GID_NONE) {
4911 VATTR_SET(&va, va_gid, uap->gid);
4912 }
4913 if (xsecdst != KAUTH_FILESEC_NONE) {
4914 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4915 va.va_vaflags |= VA_FILESEC_ACL;
4916 }
4917
4918 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4919
4920 if (xsecdst != KAUTH_FILESEC_NONE) {
4921 kauth_filesec_free(xsecdst);
4922 }
4923 return ciferror;
4924 }
4925
4926 /* ARGSUSED */
4927 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)4928 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4929 {
4930 struct vnode_attr va;
4931
4932 VATTR_INIT(&va);
4933 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4934
4935 return mkfifo1(vfs_context_current(), uap->path, &va);
4936 }
4937
4938 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4939 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4940 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4941
/*
 * Build the full path of 'dvp' (optionally with 'leafname' appended) into
 * 'path', a buffer of '_len' bytes.  'firmlink' selects vn_getpath() vs the
 * firmlink-free vn_getpath_no_firmlink() variant.
 *
 * Returns the length of the resulting string.  On truncation or failure a
 * best-effort path (possibly an ancestor's path, the mount point, or "/")
 * is produced and '*truncated_path' is set to 1 -- this never fails
 * outright.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/*
			 * Overwrite the terminating NUL with '/' and append
			 * the leaf.  The arithmetic assumes 'len' from
			 * vn_getpath includes the NUL -- TODO confirm against
			 * vn_getpath's contract.
			 */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but (nearly) fills the buffer: call it truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Could not get dvp's own path: walk up the parent chain until
		 * some ancestor's path fits, falling back to the mount point
		 * or, as a last resort, "/".
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5009
/* safe_getpath_new() with the firmlink-aware path variant (firmlink = 1). */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
5015
/* safe_getpath_new() with the firmlink-free path variant (firmlink = 0). */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
5021
5022 /*
5023 * Make a hard file link.
5024 *
5025 * Returns: 0 Success
5026 * EPERM
5027 * EEXIST
5028 * EXDEV
5029 * namei:???
5030 * vnode_authorize:???
5031 * VNOP_LINK:???
5032 */
5033 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * lookup the target node: 'nd' is reused for a second, CREATE-mode
	 * lookup of the new link name relative to fd2.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target note */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Post-link notifications: fsevents, fileop listeners, audit path. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					/* best-effort: skip the parent event */
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5247
/*
 * link(2): hard link with symlinks followed in the source path; both paths
 * are resolved relative to the CWD.
 */
int
link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
{
	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
}
5254
5255 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5256 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5257 {
5258 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5259 return EINVAL;
5260 }
5261
5262 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5263 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5264 }
5265
5266 /*
5267 * Make a symbolic link.
5268 *
5269 * We could add support for ACLs here too...
5270 */
5271 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link contents into a kernel buffer if it is a user address. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);	/* This is the link string */

	/* Look up the (not yet existing) name of the new symlink. */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}
	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* VNOP_SYMLINK did not return the vnode: re-look it up. */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Only free 'path' when it was allocated from ZV_NAMEI above. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5430
/*
 * symlink(2): the new link is created relative to the CWD.
 */
int
symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
{
	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
	    uap->link, UIO_USERSPACE);
}
5437
/*
 * symlinkat(2): like symlink(2), but a relative link name is resolved
 * against the directory fd 'uap->fd'.
 */
int
symlinkat(__unused proc_t p, struct symlinkat_args *uap,
    __unused int32_t *retval)
{
	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
	    uap->path2, UIO_USERSPACE);
}
5445
5446 /*
5447 * Delete a whiteout from the filesystem.
5448 * No longer supported.
5449 */
/*
 * undelete(2) formerly removed a whiteout; the operation is no longer
 * supported and always fails.
 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
5455
5456 /*
5457 * Delete a name from the filesystem.
5458 */
5459 /* ARGSUSED */
5460 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5461 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5462 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5463 {
5464 struct {
5465 struct nameidata nd;
5466 #if CONFIG_FSE
5467 struct vnode_attr va;
5468 fse_info finfo;
5469 #endif
5470 } *__unlink_data;
5471 struct nameidata *ndp;
5472 vnode_t vp, dvp;
5473 int error;
5474 struct componentname *cnp;
5475 char *path = NULL;
5476 char *no_firmlink_path = NULL;
5477 int len_path = 0;
5478 int len_no_firmlink_path = 0;
5479 int flags;
5480 int need_event;
5481 int has_listeners;
5482 int truncated_path;
5483 int truncated_no_firmlink_path;
5484 int batched;
5485 struct vnode_attr *vap;
5486 int do_retry;
5487 int retry_count = 0;
5488 int cn_flags;
5489
5490 cn_flags = LOCKPARENT;
5491 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
5492 cn_flags |= AUDITVNPATH1;
5493 }
5494 /* If a starting dvp is passed, it trumps any fd passed. */
5495 if (start_dvp) {
5496 cn_flags |= USEDVP;
5497 }
5498
5499 #if NAMEDRSRCFORK
5500 /* unlink or delete is allowed on rsrc forks and named streams */
5501 cn_flags |= CN_ALLOWRSRCFORK;
5502 #endif
5503
5504 __unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
5505 ndp = &__unlink_data->nd;
5506 #if CONFIG_FSE
5507 fse_info *finfop = &__unlink_data->finfo;
5508 #endif
5509
5510 retry:
5511 do_retry = 0;
5512 flags = 0;
5513 need_event = 0;
5514 has_listeners = 0;
5515 truncated_path = 0;
5516 truncated_no_firmlink_path = 0;
5517 vap = NULL;
5518
5519 NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
5520
5521 ndp->ni_dvp = start_dvp;
5522 ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
5523 cnp = &ndp->ni_cnd;
5524
5525 continue_lookup:
5526 error = nameiat(ndp, fd);
5527 if (error) {
5528 goto early_out;
5529 }
5530
5531 dvp = ndp->ni_dvp;
5532 vp = ndp->ni_vp;
5533
5534 /* With Carbon delete semantics, busy files cannot be deleted */
5535 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
5536 flags |= VNODE_REMOVE_NODELETEBUSY;
5537 }
5538
5539 /* Skip any potential upcalls if told to. */
5540 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
5541 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
5542 }
5543
5544 if (vp) {
5545 batched = vnode_compound_remove_available(vp);
5546 /*
5547 * The root of a mounted filesystem cannot be deleted.
5548 */
5549 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
5550 error = EBUSY;
5551 goto out;
5552 }
5553
5554 #if DEVELOPMENT || DEBUG
5555 /*
5556 * XXX VSWAP: Check for entitlements or special flag here
5557 * so we can restrict access appropriately.
5558 */
5559 #else /* DEVELOPMENT || DEBUG */
5560
5561 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
5562 error = EPERM;
5563 goto out;
5564 }
5565 #endif /* DEVELOPMENT || DEBUG */
5566
5567 if (!batched) {
5568 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
5569 if (error) {
5570 if (error == ENOENT) {
5571 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5572 do_retry = 1;
5573 retry_count++;
5574 }
5575 }
5576 goto out;
5577 }
5578 }
5579 } else {
5580 batched = 1;
5581
5582 if (!vnode_compound_remove_available(dvp)) {
5583 panic("No vp, but no compound remove?");
5584 }
5585 }
5586
5587 #if CONFIG_FSE
5588 need_event = need_fsevent(FSE_DELETE, dvp);
5589 if (need_event) {
5590 if (!batched) {
5591 if ((vp->v_flag & VISHARDLINK) == 0) {
5592 /* XXX need to get these data in batched VNOP */
5593 get_fse_info(vp, finfop, ctx);
5594 }
5595 } else {
5596 error =
5597 vfs_get_notify_attributes(&__unlink_data->va);
5598 if (error) {
5599 goto out;
5600 }
5601
5602 vap = &__unlink_data->va;
5603 }
5604 }
5605 #endif
5606 has_listeners = kauth_authorize_fileop_has_listeners();
5607 if (need_event || has_listeners) {
5608 if (path == NULL) {
5609 GET_PATH(path);
5610 }
5611 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
5612 if (no_firmlink_path == NULL) {
5613 GET_PATH(no_firmlink_path);
5614 }
5615 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5616 }
5617
5618 #if NAMEDRSRCFORK
5619 if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
5620 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
5621 } else
5622 #endif
5623 {
5624 error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
5625 vp = ndp->ni_vp;
5626 if (error == EKEEPLOOKING) {
5627 if (!batched) {
5628 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
5629 }
5630
5631 if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
5632 panic("EKEEPLOOKING, but continue flag not set?");
5633 }
5634
5635 if (vnode_isdir(vp)) {
5636 error = EISDIR;
5637 goto out;
5638 }
5639 goto continue_lookup;
5640 } else if (error == ENOENT && batched) {
5641 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5642 /*
5643 * For compound VNOPs, the authorization callback may
5644 * return ENOENT in case of racing hardlink lookups
5645 * hitting the name cache, redrive the lookup.
5646 */
5647 do_retry = 1;
5648 retry_count += 1;
5649 goto out;
5650 }
5651 }
5652 }
5653
5654 /*
5655 * Call out to allow 3rd party notification of delete.
5656 * Ignore result of kauth_authorize_fileop call.
5657 */
5658 if (!error) {
5659 if (has_listeners) {
5660 kauth_authorize_fileop(vfs_context_ucred(ctx),
5661 KAUTH_FILEOP_DELETE,
5662 (uintptr_t)vp,
5663 (uintptr_t)path);
5664 }
5665
5666 if (vp->v_flag & VISHARDLINK) {
5667 //
5668 // if a hardlink gets deleted we want to blow away the
5669 // v_parent link because the path that got us to this
5670 // instance of the link is no longer valid. this will
5671 // force the next call to get the path to ask the file
5672 // system instead of just following the v_parent link.
5673 //
5674 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
5675 }
5676
5677 #if CONFIG_FSE
5678 if (need_event) {
5679 if (vp->v_flag & VISHARDLINK) {
5680 get_fse_info(vp, finfop, ctx);
5681 } else if (vap) {
5682 vnode_get_fse_info_from_vap(vp, finfop, vap);
5683 }
5684 if (truncated_path) {
5685 finfop->mode |= FSE_TRUNCATED_PATH;
5686 }
5687 add_fsevent(FSE_DELETE, ctx,
5688 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5689 FSE_ARG_FINFO, finfop,
5690 FSE_ARG_DONE);
5691 }
5692 #endif
5693 }
5694
5695 out:
5696 if (path != NULL) {
5697 RELEASE_PATH(path);
5698 path = NULL;
5699 }
5700
5701 if (no_firmlink_path != NULL) {
5702 RELEASE_PATH(no_firmlink_path);
5703 no_firmlink_path = NULL;
5704 }
5705 #if NAMEDRSRCFORK
5706 /* recycle the deleted rsrc fork vnode to force a reclaim, which
5707 * will cause its shadow file to go away if necessary.
5708 */
5709 if (vp && (vnode_isnamedstream(vp)) &&
5710 (vp->v_parent != NULLVP) &&
5711 vnode_isshadow(vp)) {
5712 vnode_recycle(vp);
5713 }
5714 #endif
5715 /*
5716 * nameidone has to happen before we vnode_put(dvp)
5717 * since it may need to release the fs_nodelock on the dvp
5718 */
5719 nameidone(ndp);
5720 vnode_put(dvp);
5721 if (vp) {
5722 vnode_put(vp);
5723 }
5724
5725 if (do_retry) {
5726 goto retry;
5727 }
5728
5729 early_out:
5730 kfree_type(typeof(*__unlink_data), __unlink_data);
5731 return error;
5732 }
5733
5734 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5735 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5736 enum uio_seg segflg, int unlink_flags)
5737 {
5738 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5739 unlink_flags);
5740 }
5741
5742 /*
5743 * Delete a name from the filesystem using Carbon semantics.
5744 */
5745 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)5746 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5747 {
5748 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5749 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5750 }
5751
5752 /*
5753 * Delete a name from the filesystem using POSIX semantics.
5754 */
5755 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)5756 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5757 {
5758 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5759 uap->path, UIO_USERSPACE, 0);
5760 }
5761
5762 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)5763 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5764 {
5765 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5766 return EINVAL;
5767 }
5768
5769 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5770 int unlink_flags = 0;
5771
5772 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5773 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5774 }
5775 return rmdirat_internal(vfs_context_current(), uap->fd,
5776 uap->path, UIO_USERSPACE, unlink_flags);
5777 } else {
5778 return unlinkat_internal(vfs_context_current(), uap->fd,
5779 NULLVP, uap->path, UIO_USERSPACE, 0);
5780 }
5781 }
5782
5783 /*
5784 * Reposition read/write file offset.
5785 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd is not backed by a vnode (e.g. socket): POSIX wants ESPIPE */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		/* seeking on a FIFO is meaningless */
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) is a pure offset query; everything else may
	 * change the offset, so apply the corresponding MAC check.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:
		/* relative to the current file offset */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* relative to end-of-file */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		/* absolute: "offset" already holds the requested position */
		break;
	case SEEK_HOLE:
		/* ask the filesystem for the next hole at/after "offset" */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		/* ask the filesystem for the next data region at/after "offset" */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
5877
5878
5879 /*
5880 * Check access permissions.
5881 *
5882 * Returns: 0 Success
5883 * vnode_authorize:???
5884 */
5885 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)5886 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5887 {
5888 kauth_action_t action;
5889 int error;
5890
5891 /*
5892 * If just the regular access bits, convert them to something
5893 * that vnode_authorize will understand.
5894 */
5895 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5896 action = 0;
5897 if (uflags & R_OK) {
5898 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5899 }
5900 if (uflags & W_OK) {
5901 if (vnode_isdir(vp)) {
5902 action |= KAUTH_VNODE_ADD_FILE |
5903 KAUTH_VNODE_ADD_SUBDIRECTORY;
5904 /* might want delete rights here too */
5905 } else {
5906 action |= KAUTH_VNODE_WRITE_DATA;
5907 }
5908 }
5909 if (uflags & X_OK) {
5910 if (vnode_isdir(vp)) {
5911 action |= KAUTH_VNODE_SEARCH;
5912 } else {
5913 action |= KAUTH_VNODE_EXECUTE;
5914 }
5915 }
5916 } else {
5917 /* take advantage of definition of uflags */
5918 action = uflags >> 8;
5919 }
5920
5921 #if CONFIG_MACF
5922 error = mac_vnode_check_access(ctx, vp, uflags);
5923 if (error) {
5924 return error;
5925 }
5926 #endif /* MAC */
5927
5928 /* action == 0 means only check for existence */
5929 if (action != 0) {
5930 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5931 } else {
5932 error = 0;
5933 }
5934
5935 return error;
5936 }
5937
5938
5939
5940 /*
5941 * access_extended: Check access permissions in bulk.
5942 *
5943 * Description: uap->entries Pointer to an array of accessx
5944 * descriptor structs, plus one or
5945 * more NULL terminated strings (see
5946 * "Notes" section below).
5947 * uap->size Size of the area pointed to by
5948 * uap->entries.
5949 * uap->results Pointer to the results array.
5950 *
5951 * Returns: 0 Success
5952 * ENOMEM Insufficient memory
5953 * EINVAL Invalid arguments
5954 * namei:EFAULT Bad address
5955 * namei:ENAMETOOLONG Filename too long
5956 * namei:ENOENT No such file or directory
5957 * namei:ELOOP Too many levels of symbolic links
5958 * namei:EBADF Bad file descriptor
5959 * namei:ENOTDIR Not a directory
5960 * namei:???
5961 * access1:
5962 *
5963 * Implicit returns:
5964 * uap->results Array contents modified
5965 *
5966 * Notes: The uap->entries are structured as an arbitrary length array
5967 * of accessx descriptors, followed by one or more NULL terminated
5968 * strings
5969 *
5970 * struct accessx_descriptor[0]
5971 * ...
5972 * struct accessx_descriptor[n]
5973 * char name_data[0];
5974 *
5975 * We determine the entry count by walking the buffer containing
5976 * the uap->entries argument descriptor. For each descriptor we
5977 * see, the valid values for the offset ad_name_offset will be
5978 * in the byte range:
5979 *
5980 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5981 * to
5982 * [ uap->entries + uap->size - 2 ]
5983 *
5984 * since we must have at least one string, and the string must
5985 * be at least one character plus the NULL terminator in length.
5986 *
5987 * XXX: Need to support the check-as uid argument
5988 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory. Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are satisfied from the stack; larger ones allocate */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end. If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity. This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode. We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			/* per-entry failure: record it and continue with the rest */
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6230
6231
6232 /*
6233 * Returns: 0 Success
6234 * namei:EFAULT Bad address
6235 * namei:ENAMETOOLONG Filename too long
6236 * namei:ENOENT No such file or directory
6237 * namei:ELOOP Too many levels of symbolic links
6238 * namei:EBADF Bad file descriptor
6239 * namei:ENOTDIR Not a directory
6240 * namei:???
6241 * access1:
6242 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* honor the no-follow variants; always audit the looked-up path */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse a symlink anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* perform the actual permission evaluation */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* WANTPARENT gave us an iocount on the parent; drop it */
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* release the real-identity credential taken above */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6324
6325 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6326 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6327 {
6328 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6329 uap->path, uap->flags, 0, UIO_USERSPACE);
6330 }
6331
6332 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6333 faccessat(__unused proc_t p, struct faccessat_args *uap,
6334 __unused int32_t *retval)
6335 {
6336 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6337 return EINVAL;
6338 }
6339
6340 return faccessat_internal(vfs_context_current(), uap->fd,
6341 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6342 }
6343
6344 /*
6345 * Returns: 0 Success
6346 * EFAULT
6347 * copyout:EFAULT
6348 * namei:???
6349 * vn_stat:???
6350 */
6351 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6352 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6353 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6354 enum uio_seg segflg, int fd, int flag)
6355 {
6356 struct nameidata nd;
6357 int follow;
6358 union {
6359 struct stat sb;
6360 struct stat64 sb64;
6361 } source = {};
6362 union {
6363 struct user64_stat user64_sb;
6364 struct user32_stat user32_sb;
6365 struct user64_stat64 user64_sb64;
6366 struct user32_stat64 user32_sb64;
6367 } dest = {};
6368 caddr_t sbp;
6369 int error, my_size;
6370 kauth_filesec_t fsec;
6371 size_t xsecurity_bufsize;
6372 void * statptr;
6373 struct fileproc *fp = NULL;
6374 int needsrealdev = 0;
6375
6376 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6377 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6378 segflg, path, ctx);
6379 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6380 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6381 }
6382
6383 #if NAMEDRSRCFORK
6384 int is_namedstream = 0;
6385 /* stat calls are allowed for resource forks. */
6386 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6387 #endif
6388
6389 if (flag & AT_FDONLY) {
6390 vnode_t fvp;
6391
6392 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6393 if (error) {
6394 return error;
6395 }
6396 if ((error = vnode_getwithref(fvp))) {
6397 file_drop(fd);
6398 return error;
6399 }
6400 nd.ni_vp = fvp;
6401 } else {
6402 error = nameiat(&nd, fd);
6403 if (error) {
6404 return error;
6405 }
6406 }
6407 fsec = KAUTH_FILESEC_NONE;
6408
6409 statptr = (void *)&source;
6410
6411 #if NAMEDRSRCFORK
6412 /* Grab reference on the shadow stream file vnode to
6413 * force an inactive on release which will mark it
6414 * for recycle.
6415 */
6416 if (vnode_isnamedstream(nd.ni_vp) &&
6417 (nd.ni_vp->v_parent != NULLVP) &&
6418 vnode_isshadow(nd.ni_vp)) {
6419 is_namedstream = 1;
6420 vnode_ref(nd.ni_vp);
6421 }
6422 #endif
6423
6424 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6425 if (fp && (xsecurity == USER_ADDR_NULL)) {
6426 /*
6427 * If the caller has the file open, and is not
6428 * requesting extended security information, we are
6429 * going to let them get the basic stat information.
6430 */
6431 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6432 fp->fp_glob->fg_cred);
6433 } else {
6434 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6435 isstat64, needsrealdev, ctx);
6436 }
6437
6438 #if NAMEDRSRCFORK
6439 if (is_namedstream) {
6440 vnode_rele(nd.ni_vp);
6441 }
6442 #endif
6443 vnode_put(nd.ni_vp);
6444 nameidone(&nd);
6445 if (fp) {
6446 file_drop(fd);
6447 fp = NULL;
6448 }
6449
6450 if (error) {
6451 return error;
6452 }
6453 /* Zap spare fields */
6454 if (isstat64 != 0) {
6455 source.sb64.st_lspare = 0;
6456 source.sb64.st_qspare[0] = 0LL;
6457 source.sb64.st_qspare[1] = 0LL;
6458 if (vfs_context_is64bit(ctx)) {
6459 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6460 my_size = sizeof(dest.user64_sb64);
6461 sbp = (caddr_t)&dest.user64_sb64;
6462 } else {
6463 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6464 my_size = sizeof(dest.user32_sb64);
6465 sbp = (caddr_t)&dest.user32_sb64;
6466 }
6467 /*
6468 * Check if we raced (post lookup) against the last unlink of a file.
6469 */
6470 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6471 source.sb64.st_nlink = 1;
6472 }
6473 } else {
6474 source.sb.st_lspare = 0;
6475 source.sb.st_qspare[0] = 0LL;
6476 source.sb.st_qspare[1] = 0LL;
6477 if (vfs_context_is64bit(ctx)) {
6478 munge_user64_stat(&source.sb, &dest.user64_sb);
6479 my_size = sizeof(dest.user64_sb);
6480 sbp = (caddr_t)&dest.user64_sb;
6481 } else {
6482 munge_user32_stat(&source.sb, &dest.user32_sb);
6483 my_size = sizeof(dest.user32_sb);
6484 sbp = (caddr_t)&dest.user32_sb;
6485 }
6486
6487 /*
6488 * Check if we raced (post lookup) against the last unlink of a file.
6489 */
6490 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6491 source.sb.st_nlink = 1;
6492 }
6493 }
6494 if ((error = copyout(sbp, ub, my_size)) != 0) {
6495 goto out;
6496 }
6497
6498 /* caller wants extended security information? */
6499 if (xsecurity != USER_ADDR_NULL) {
6500 /* did we get any? */
6501 if (fsec == KAUTH_FILESEC_NONE) {
6502 if (susize(xsecurity_size, 0) != 0) {
6503 error = EFAULT;
6504 goto out;
6505 }
6506 } else {
6507 /* find the user buffer size */
6508 xsecurity_bufsize = fusize(xsecurity_size);
6509
6510 /* copy out the actual data size */
6511 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6512 error = EFAULT;
6513 goto out;
6514 }
6515
6516 /* if the caller supplied enough room, copy out to it */
6517 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6518 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6519 }
6520 }
6521 }
6522 out:
6523 if (fsec != KAUTH_FILESEC_NONE) {
6524 kauth_filesec_free(fsec);
6525 }
6526 return error;
6527 }
6528
6529 /*
6530 * stat_extended: Get file status; with extended security (ACL).
6531 *
6532 * Parameters: p (ignored)
6533 * uap User argument descriptor (see below)
6534 * retval (ignored)
6535 *
6536 * Indirect: uap->path Path of file to get status from
6537 * uap->ub User buffer (holds file status info)
6538 * uap->xsecurity ACL to get (extended security)
6539 * uap->xsecurity_size Size of ACL
6540 *
6541 * Returns: 0 Success
6542 * !0 errno value
6543 *
6544 */
6545 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)6546 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6547 __unused int32_t *retval)
6548 {
6549 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6550 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6551 0);
6552 }
6553
6554 /*
6555 * Returns: 0 Success
6556 * fstatat_internal:??? [see fstatat_internal() in this file]
6557 */
6558 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)6559 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6560 {
6561 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6562 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6563 }
6564
6565 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)6566 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6567 {
6568 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6569 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6570 }
6571
6572 /*
6573 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6574 *
6575 * Parameters: p (ignored)
6576 * uap User argument descriptor (see below)
6577 * retval (ignored)
6578 *
6579 * Indirect: uap->path Path of file to get status from
6580 * uap->ub User buffer (holds file status info)
6581 * uap->xsecurity ACL to get (extended security)
6582 * uap->xsecurity_size Size of ACL
6583 *
6584 * Returns: 0 Success
6585 * !0 errno value
6586 *
6587 */
6588 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)6589 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6590 {
6591 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6592 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6593 0);
6594 }
6595
6596 /*
6597 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6598 *
6599 * Parameters: p (ignored)
6600 * uap User argument descriptor (see below)
6601 * retval (ignored)
6602 *
6603 * Indirect: uap->path Path of file to get status from
6604 * uap->ub User buffer (holds file status info)
6605 * uap->xsecurity ACL to get (extended security)
6606 * uap->xsecurity_size Size of ACL
6607 *
6608 * Returns: 0 Success
6609 * !0 errno value
6610 *
6611 */
6612 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)6613 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6614 {
6615 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6616 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6617 AT_SYMLINK_NOFOLLOW);
6618 }
6619
6620 /*
6621 * Get file status; this version does not follow links.
6622 */
6623 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)6624 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6625 {
6626 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6627 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6628 }
6629
6630 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)6631 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6632 {
6633 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6634 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6635 }
6636
6637 /*
6638 * lstat64_extended: Get file status; can handle large inode numbers; does not
6639 * follow links; with extended security (ACL).
6640 *
6641 * Parameters: p (ignored)
6642 * uap User argument descriptor (see below)
6643 * retval (ignored)
6644 *
6645 * Indirect: uap->path Path of file to get status from
6646 * uap->ub User buffer (holds file status info)
6647 * uap->xsecurity ACL to get (extended security)
6648 * uap->xsecurity_size Size of ACL
6649 *
6650 * Returns: 0 Success
6651 * !0 errno value
6652 *
6653 */
int
lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
{
	/* 64-bit-inode variant of lstat_extended(): no symlink follow, with ACL. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	    uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
	    AT_SYMLINK_NOFOLLOW);
}
6661
int
fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
{
	/* Reject any flags beyond the supported set. */
	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	/* isstat64 == 0: legacy 32-bit-inode stat structure. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	    0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
}
6672
int
fstatat64(__unused proc_t p, struct fstatat64_args *uap,
    __unused int32_t *retval)
{
	/* Reject any flags beyond the supported set. */
	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	/* isstat64 == 1: 64-bit-inode stat structure. */
	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
	    0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
}
6684
6685 /*
6686 * Get configurable pathname variables.
6687 *
6688 * Returns: 0 Success
6689 * namei:???
6690 * vn_pathconf:???
6691 *
6692 * Notes: Global implementation constants are intended to be
6693 * implemented in this function directly; all other constants
6694 * are per-FS implementation, and therefore must be handled in
6695 * each respective FS, instead.
6696 *
6697 * XXX We implement some things globally right now that should actually be
6698 * XXX per-FS; we will need to deal with this at some point.
6699 */
6700 /* ARGSUSED */
int
pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
{
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Look up the path (following symlinks) to obtain the vnode. */
	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/* Per-FS and global pathconf variables are resolved by vn_pathconf(). */
	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);

	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
6721
6722 /*
6723 * Return target name of a symbolic link.
6724 */
6725 /* ARGSUSED */
6726 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)6727 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
6728 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6729 int *retval)
6730 {
6731 vnode_t vp;
6732 uio_t auio;
6733 int error;
6734 struct nameidata nd;
6735 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
6736 bool put_vnode;
6737
6738 if (bufsize > INT32_MAX) {
6739 return EINVAL;
6740 }
6741
6742 if (lnk_vp) {
6743 vp = lnk_vp;
6744 put_vnode = false;
6745 } else {
6746 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6747 seg, path, ctx);
6748
6749 error = nameiat(&nd, fd);
6750 if (error) {
6751 return error;
6752 }
6753 vp = nd.ni_vp;
6754 put_vnode = true;
6755 nameidone(&nd);
6756 }
6757
6758 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6759 &uio_buf[0], sizeof(uio_buf));
6760 uio_addiov(auio, buf, bufsize);
6761 if (vp->v_type != VLNK) {
6762 error = EINVAL;
6763 } else {
6764 #if CONFIG_MACF
6765 error = mac_vnode_check_readlink(ctx, vp);
6766 #endif
6767 if (error == 0) {
6768 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6769 ctx);
6770 }
6771 if (error == 0) {
6772 error = VNOP_READLINK(vp, auio, ctx);
6773 }
6774 }
6775
6776 if (put_vnode) {
6777 vnode_put(vp);
6778 }
6779
6780 *retval = (int)(bufsize - uio_resid(auio));
6781 return error;
6782 }
6783
int
freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
{
	enum uio_seg procseg;
	vnode_t vp;
	int error;

	/* Pick the user address-space width matching the calling process. */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; released after the read below. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* fd and path are unused when an explicit vnode is supplied. */
	error = readlinkat_internal(vfs_context_current(), -1,
	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
	    uap->bufsize, procseg, retval);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6811
int
readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
{
	enum uio_seg procseg;

	/* Pick the user address-space width matching the calling process. */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	/* Path is resolved relative to the current working directory. */
	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
	    CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
	    uap->count, procseg, retval);
}
6822
int
readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
{
	enum uio_seg procseg;

	/* Pick the user address-space width matching the calling process. */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	/* Path is resolved relative to uap->fd (or CWD if AT_FDCWD). */
	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
	    CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
	    retval);
}
6833
6834 /*
6835 * Change file flags, the deep inner layer.
6836 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC framework veto point for changing file flags. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorization, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* Apply the change through the caller-supplied setter (e.g. vnode_setattr). */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* Notify MAC listeners only after a successful flag change. */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
6875
6876 /*
6877 * Change file flags.
6878 *
6879 * NOTE: this will vnode_put() `vp'
6880 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* chflags0 authorizes and applies; vnode_setattr is the setter. */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	/* NOTE: this consumes the caller's iocount on vp (see header comment). */
	vnode_put(vp);

	/* Filesystem silently ignored va_flags: report lack of support. */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
6899
6900 /*
6901 * Change flags of a file given a path name.
6902 */
6903 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	AUDIT_ARG(fflags, uap->flags);
	/* Resolve the path, following a trailing symlink. */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
6927
6928 /*
6929 * Change flags of a file given a file descriptor.
6930 */
6931 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount; chflags1 releases it on our behalf. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
6957
6958 /*
6959 * Change security information on a filesystem object.
6960 *
6961 * Returns: 0 Success
6962 * EPERM Operation not permitted
6963 * vnode_authattr:??? [anything vnode_authattr can return]
6964 * vnode_authorize:??? [anything vnode_authorize can return]
6965 * vnode_setattr:??? [anything vnode_setattr can return]
6966 *
6967 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6968 * translated to EPERM before being returned.
6969 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC veto points: one per class of attribute being changed. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* EACCES from the auth layer is reported as EPERM (see header). */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Notify MAC listeners of each successful change. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7037
7038
7039 /*
7040 * Change mode of a file given a path name.
7041 *
7042 * Returns: 0 Success
7043 * namei:??? [anything namei can return]
7044 * chmod_vnode:??? [anything chmod_vnode can return]
7045 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;

	/* Either no-follow flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
	    segflg, path, ctx);
	/* NOFOLLOW_ANY additionally forbids symlinks anywhere in the path. */
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}
	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7067
7068 /*
7069 * chmod_extended: Change the mode of a file given a path name; with extended
7070 * argument list (including extended security (ACL)).
7071 *
7072 * Parameters: p Process requesting the open
7073 * uap User argument descriptor (see below)
7074 * retval (ignored)
7075 *
7076 * Indirect: uap->path Path to object (same as 'chmod')
7077 * uap->uid UID to set
7078 * uap->gid GID to set
7079 * uap->mode File mode to set (same as 'chmod')
7080 * uap->xsecurity ACL to set (or delete)
7081 *
7082 * Returns: 0 Success
7083 * !0 errno value
7084 *
7085 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7086 *
 * XXX: We should enumerate the possible errno values here, and where
7088 * in the code they originated.
7089 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Only request changes for the fields the caller actually supplied. */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/* Copy the user-supplied filesec in; freed below. */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7136
7137 /*
7138 * Returns: 0 Success
7139 * chmodat:??? [anything chmodat can return]
7140 */
7141 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7142 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7143 int flag, enum uio_seg segflg)
7144 {
7145 struct vnode_attr va;
7146
7147 VATTR_INIT(&va);
7148 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7149
7150 return chmodat(ctx, path, &va, fd, flag, segflg);
7151 }
7152
int
chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
{
	/* Classic chmod(2): relative to CWD, no flags, user-space path. */
	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
	    AT_FDCWD, 0, UIO_USERSPACE);
}
7159
int
fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
{
	/* Only the symlink-handling flags are accepted. */
	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
	    uap->fd, uap->flag, UIO_USERSPACE);
}
7170
7171 /*
7172 * Change mode of a file given a file descriptor.
7173 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount for the duration of the setattr. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7197
7198 /*
7199 * fchmod_extended: Change mode of a file given a file descriptor; with
7200 * extended argument list (including extended security (ACL)).
7201 *
7202 * Parameters: p Process requesting to change file mode
7203 * uap User argument descriptor (see below)
7204 * retval (ignored)
7205 *
7206 * Indirect: uap->mode File mode to set (same as 'chmod')
7207 * uap->uid UID to set
7208 * uap->gid GID to set
7209 * uap->xsecurity ACL to set (or delete)
7210 * uap->fd File descriptor of file to change mode
7211 *
7212 * Returns: 0 Success
7213 * !0 errno value
7214 *
7215 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	} else {
		/* NOTE(review): mode untouched but field zeroed without VATTR_SET;
		 * presumably defensive — va_mode stays inactive. */
		va.va_mode = 0;
	}

	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* NULL means remove here (unlike chmod_extended, where it means "unset"). */
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		/* Copy the user-supplied filesec in; freed below. */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	error = fchmod1(p, uap->fd, &va);


	/* Free the copied-in filesec only in the case that allocated it. */
	switch (uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL) {
			kauth_filesec_free(xsecdst);
		}
	}
	return error;
}
7272
7273 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7274 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7275 {
7276 struct vnode_attr va;
7277
7278 VATTR_INIT(&va);
7279 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7280
7281 return fchmod1(p, uap->fd, &va);
7282 }
7283
7284
7285 /*
7286 * Set ownership given a path name.
7287 */
7288 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	/* Either no-follow flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	/* NOFOLLOW_ANY additionally forbids symlinks anywhere in the path. */
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	vnode_put(vp);
	return error;
}
7358
int
chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
{
	/* Classic chown(2): relative to CWD, follows symlinks. */
	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    uap->uid, uap->gid, 0, UIO_USERSPACE);
}
7365
int
lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
{
	/* Like chown(2) but operates on the symlink itself (no follow). */
	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
}
7372
7373 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7374 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7375 {
7376 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7377 return EINVAL;
7378 }
7379
7380 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7381 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7382 }
7383
7384 /*
7385 * Set ownership given a file descriptor.
7386 */
7387 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount for the duration of the setattr. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures are reported as EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7456
7457 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)7458 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7459 {
7460 int error;
7461
7462 if (usrtvp == USER_ADDR_NULL) {
7463 struct timeval old_tv;
7464 /* XXX Y2038 bug because of microtime argument */
7465 microtime(&old_tv);
7466 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7467 tsp[1] = tsp[0];
7468 } else {
7469 if (IS_64BIT_PROCESS(current_proc())) {
7470 struct user64_timeval tv[2];
7471 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7472 if (error) {
7473 return error;
7474 }
7475 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
7476 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
7477 } else {
7478 struct user32_timeval tv[2];
7479 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7480 if (error) {
7481 return error;
7482 }
7483 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7484 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7485 }
7486 }
7487 return 0;
7488 }
7489
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] is the access time, ts[1] the modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	/* nullflag: caller passed a NULL timeval (i.e. "set to now"). */
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* Explicit-times failures report EPERM rather than EACCES. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7546
7547 /*
7548 * Set the access and modification times of a file.
7549 */
7550 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	vnode_put(nd.ni_vp);
	return error;
}
7587
7588 /*
7589 * Set the access and modification times of a file.
7590 */
7591 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Resolve the times first; a bad user pointer fails early. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount for the duration of the setattr. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7618
7619 /*
7620 * Truncate a file given its path name.
7621 */
7622 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;
	rlim_t fsize_limit;

	/* Negative lengths are invalid. */
	if (uap->length < 0) {
		return EINVAL;
	}

	/* Growing a file beyond RLIMIT_FSIZE raises SIGXFSZ and fails. */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	/* NOCRED: path-based truncate has no file credential to check. */
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7681
7682 /*
7683 * Truncate a file given a file descriptor.
7684 */
7685 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	rlim_t fsize_limit;

	AUDIT_ARG(fd, uap->fd);
	/* Negative lengths are invalid. */
	if (uap->length < 0) {
		return EINVAL;
	}

	/* Growing a file beyond RLIMIT_FSIZE raises SIGXFSZ and fails. */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	/* Dispatch on descriptor type: POSIX shm has its own truncate path. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->fp_glob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7760
7761
7762 /*
7763 * Sync an open file with synchronized I/O _file_ integrity completion
7764 */
7765 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* Cancelable variant: test for pending thread cancellation first. */
	__pthread_testcancel(1);
	/* MNT_WAIT: full file-integrity sync (data + all metadata). */
	return fsync_common(p, uap, MNT_WAIT);
}
7772
7773
7774 /*
7775 * Sync an open file with synchronized I/O _file_ integrity completion
7776 *
7777 * Notes: This is a legacy support function that does not test for
7778 * thread cancellation points.
7779 */
7780 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Same as fsync() but without the thread-cancellation test. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
7786
7787
7788 /*
7789 * Sync an open file with synchronized I/O _data_ integrity completion
7790 */
7791 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	__pthread_testcancel(1);
	/* MNT_DWAIT: data-integrity sync only (metadata may lag). */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
7798
7799
7800 /*
7801 * fsync_common
7802 *
7803 * Common fsync code to support both synchronized I/O file integrity completion
7804 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7805 *
7806 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7807 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
7809 * includes additional metadata unnecessary for retrieving the file data
7810 * contents, such as atime, mtime, ctime, etc., also be committed to stable
7811 * storage.
7812 *
7813 * Parameters: p The process
7814 * uap->fd The descriptor to synchronize
7815 * flags The data integrity flags
7816 *
7817 * Returns: int Success
7818 * fp_getfvp:EBADF Bad file descriptor
7819 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7820 * VNOP_FSYNC:??? unspecified
7821 *
7822 * Notes: We use struct fsync_args because it is a short name, and all
7823 * caller argument structures are otherwise identical.
7824 */
7825 static int
fsync_common(proc_t p,struct fsync_args * uap,int flags)7826 fsync_common(proc_t p, struct fsync_args *uap, int flags)
7827 {
7828 vnode_t vp;
7829 struct fileproc *fp;
7830 vfs_context_t ctx = vfs_context_current();
7831 int error;
7832
7833 AUDIT_ARG(fd, uap->fd);
7834
7835 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
7836 return error;
7837 }
7838 if ((error = vnode_getwithref(vp))) {
7839 file_drop(uap->fd);
7840 return error;
7841 }
7842
7843 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7844
7845 error = VNOP_FSYNC(vp, flags, ctx);
7846
7847 #if NAMEDRSRCFORK
7848 /* Sync resource fork shadow file if necessary. */
7849 if ((error == 0) &&
7850 (vp->v_flag & VISNAMEDSTREAM) &&
7851 (vp->v_parent != NULLVP) &&
7852 vnode_isshadow(vp) &&
7853 (fp->fp_glob->fg_flag & FWASWRITTEN)) {
7854 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
7855 }
7856 #endif
7857
7858 (void)vnode_put(vp);
7859 file_drop(uap->fd);
7860 return error;
7861 }
7862
7863 /*
7864 * Duplicate files. Source must be a file, target must be a file or
7865 * must not exist.
7866 *
7867 * XXX Copyfile authorisation checking is woefully inadequate, and will not
7868 * perform inheritance correctly.
7869 */
7870 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Resolve the source; on success we hold an iocount on fvp. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Resolve the target with CREATE semantics; SAVESTART keeps
	 * tond.ni_startdir referenced so it can be released below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Neither endpoint may be a directory. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets cannot be copied (fdesc-backed vnodes are exempt). */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Read source, delete existing target, add entry to target dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/*
	 * Source may not be the target's parent directory.
	 * NB: no goto here — if fvp == tvp also holds below, this EINVAL
	 * is overwritten by the -1 "nothing to do" sentinel.
	 */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* -1 is the internal "same file, success" sentinel, not an errno. */
	if (error == -1) {
		return 0;
	}
	return error;
}
7972
7973 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7974
7975 /*
7976 * Helper function for doing clones. The caller is expected to provide an
7977 * iocounted source vnode and release it.
7978 */
7979 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)7980 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7981 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7982 {
7983 vnode_t tvp, tdvp;
7984 struct nameidata tond;
7985 int error;
7986 int follow;
7987 boolean_t free_src_acl;
7988 boolean_t attr_cleanup;
7989 enum vtype v_type;
7990 kauth_action_t action;
7991 struct componentname *cnp;
7992 uint32_t defaulted;
7993 struct vnode_attr va;
7994 struct vnode_attr nva;
7995 uint32_t vnop_flags;
7996
7997 v_type = vnode_vtype(fvp);
7998 switch (v_type) {
7999 case VLNK:
8000 /* FALLTHRU */
8001 case VREG:
8002 action = KAUTH_VNODE_ADD_FILE;
8003 break;
8004 case VDIR:
8005 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8006 fvp->v_mountedhere) {
8007 return EINVAL;
8008 }
8009 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8010 break;
8011 default:
8012 return EINVAL;
8013 }
8014
8015 AUDIT_ARG(fd2, dst_dirfd);
8016 AUDIT_ARG(value32, flags);
8017
8018 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8019 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8020 UIO_USERSPACE, dst, ctx);
8021 if ((error = nameiat(&tond, dst_dirfd))) {
8022 return error;
8023 }
8024 cnp = &tond.ni_cnd;
8025 tdvp = tond.ni_dvp;
8026 tvp = tond.ni_vp;
8027
8028 free_src_acl = FALSE;
8029 attr_cleanup = FALSE;
8030
8031 if (tvp != NULL) {
8032 error = EEXIST;
8033 goto out;
8034 }
8035
8036 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8037 error = EXDEV;
8038 goto out;
8039 }
8040
8041 #if CONFIG_MACF
8042 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8043 goto out;
8044 }
8045 #endif
8046 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8047 goto out;
8048 }
8049
8050 action = KAUTH_VNODE_GENERIC_READ_BITS;
8051 if (data_read_authorised) {
8052 action &= ~KAUTH_VNODE_READ_DATA;
8053 }
8054 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8055 goto out;
8056 }
8057
8058 /*
8059 * certain attributes may need to be changed from the source, we ask for
8060 * those here with the exception of source file's ACL. The clone file
8061 * will inherit the target directory's ACL.
8062 */
8063 VATTR_INIT(&va);
8064 VATTR_WANTED(&va, va_uid);
8065 VATTR_WANTED(&va, va_gid);
8066 VATTR_WANTED(&va, va_mode);
8067 VATTR_WANTED(&va, va_flags);
8068
8069 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
8070 goto out;
8071 }
8072
8073 VATTR_INIT(&nva);
8074 VATTR_SET(&nva, va_type, v_type);
8075 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8076 VATTR_SET(&nva, va_acl, va.va_acl);
8077 free_src_acl = TRUE;
8078 }
8079
8080 /* Handle ACL inheritance, initialize vap. */
8081 if (v_type == VLNK) {
8082 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
8083 } else {
8084 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
8085 if (error) {
8086 goto out;
8087 }
8088 attr_cleanup = TRUE;
8089 }
8090
8091 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8092 /*
8093 * We've got initial values for all security parameters,
8094 * If we are superuser, then we can change owners to be the
8095 * same as the source. Both superuser and the owner have default
8096 * WRITE_SECURITY privileges so all other fields can be taken
8097 * from source as well.
8098 */
8099 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8100 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8101 VATTR_SET(&nva, va_uid, va.va_uid);
8102 }
8103 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8104 VATTR_SET(&nva, va_gid, va.va_gid);
8105 }
8106 } else {
8107 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8108 }
8109
8110 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8111 VATTR_SET(&nva, va_mode, va.va_mode);
8112 }
8113 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
8114 VATTR_SET(&nva, va_flags,
8115 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8116 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8117 }
8118
8119 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8120
8121 if (!error && tvp) {
8122 int update_flags = 0;
8123 #if CONFIG_FSE
8124 int fsevent;
8125 #endif /* CONFIG_FSE */
8126
8127 /*
8128 * If some of the requested attributes weren't handled by the
8129 * VNOP, use our fallback code.
8130 */
8131 if (!VATTR_ALL_SUPPORTED(&nva)) {
8132 (void)vnode_setattr_fallback(tvp, &nva, ctx);
8133 }
8134
8135 #if CONFIG_MACF
8136 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8137 VNODE_LABEL_CREATE, ctx);
8138 #endif
8139
8140 // Make sure the name & parent pointers are hooked up
8141 if (tvp->v_name == NULL) {
8142 update_flags |= VNODE_UPDATE_NAME;
8143 }
8144 if (tvp->v_parent == NULLVP) {
8145 update_flags |= VNODE_UPDATE_PARENT;
8146 }
8147
8148 if (update_flags) {
8149 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8150 cnp->cn_namelen, cnp->cn_hash, update_flags);
8151 }
8152
8153 #if CONFIG_FSE
8154 switch (vnode_vtype(tvp)) {
8155 case VLNK:
8156 /* FALLTHRU */
8157 case VREG:
8158 fsevent = FSE_CREATE_FILE;
8159 break;
8160 case VDIR:
8161 fsevent = FSE_CREATE_DIR;
8162 break;
8163 default:
8164 goto out;
8165 }
8166
8167 if (need_fsevent(fsevent, tvp)) {
8168 /*
8169 * The following is a sequence of three explicit events.
8170 * A pair of FSE_CLONE events representing the source and destination
8171 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8172 * fseventsd may coalesce the destination clone and create events
8173 * into a single event resulting in the following sequence for a client
8174 * FSE_CLONE (src)
8175 * FSE_CLONE | FSE_CREATE (dst)
8176 */
8177 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8178 FSE_ARG_DONE);
8179 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8180 FSE_ARG_DONE);
8181 }
8182 #endif /* CONFIG_FSE */
8183 }
8184
8185 out:
8186 if (attr_cleanup) {
8187 vn_attribute_cleanup(&nva, defaulted);
8188 }
8189 if (free_src_acl && va.va_acl) {
8190 kauth_acl_free(va.va_acl);
8191 }
8192 nameidone(&tond);
8193 if (tvp) {
8194 vnode_put(tvp);
8195 }
8196 vnode_put(tdvp);
8197 return error;
8198 }
8199
8200 /*
8201 * clone files or directories, target must not exist.
8202 */
8203 /* ARGSUSED */
8204 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8205 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8206 __unused int32_t *retval)
8207 {
8208 vnode_t fvp;
8209 struct nameidata fromnd;
8210 int follow;
8211 int error;
8212 vfs_context_t ctx = vfs_context_current();
8213
8214 /* Check that the flags are valid. */
8215 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8216 return EINVAL;
8217 }
8218
8219 AUDIT_ARG(fd, uap->src_dirfd);
8220
8221 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8222 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8223 UIO_USERSPACE, uap->src, ctx);
8224 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8225 return error;
8226 }
8227
8228 fvp = fromnd.ni_vp;
8229 nameidone(&fromnd);
8230
8231 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8232 uap->flags, ctx);
8233
8234 vnode_put(fvp);
8235 return error;
8236 }
8237
8238 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8239 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8240 __unused int32_t *retval)
8241 {
8242 vnode_t fvp;
8243 struct fileproc *fp;
8244 int error;
8245 vfs_context_t ctx = vfs_context_current();
8246
8247 /* Check that the flags are valid. */
8248 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8249 return EINVAL;
8250 }
8251
8252 AUDIT_ARG(fd, uap->src_fd);
8253 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8254 if (error) {
8255 return error;
8256 }
8257
8258 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8259 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8260 error = EBADF;
8261 goto out;
8262 }
8263
8264 if ((error = vnode_getwithref(fvp))) {
8265 goto out;
8266 }
8267
8268 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8269
8270 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8271 uap->flags, ctx);
8272
8273 vnode_put(fvp);
8274 out:
8275 file_drop(uap->src_fd);
8276 return error;
8277 }
8278
8279 static int
rename_submounts_callback(mount_t mp,void * arg)8280 rename_submounts_callback(mount_t mp, void *arg)
8281 {
8282 int error = 0;
8283 mount_t pmp = (mount_t)arg;
8284 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8285
8286 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8287 return 0;
8288 }
8289
8290 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8291 return 0;
8292 }
8293
8294 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8295 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8296 return -1;
8297 }
8298
8299 int pathlen = MAXPATHLEN;
8300 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8301 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8302 }
8303
8304 vfs_unbusy(mp);
8305
8306 return error;
8307 }
8308
8309 /*
8310 * Rename files. Source and destination must either both be directories,
8311 * or both not be directories. If target is a directory, it must be empty.
8312 */
8313 /* ARGSUSED */
8314 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8315 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8316 int tofd, user_addr_t to, int segflg, u_int uflags)
8317 {
8318 vnode_t tvp, tdvp;
8319 vnode_t fvp, fdvp;
8320 vnode_t mnt_fvp;
8321 struct nameidata *fromnd, *tond;
8322 int error;
8323 int do_retry;
8324 int retry_count;
8325 int mntrename;
8326 int need_event;
8327 int need_kpath2;
8328 int has_listeners;
8329 const char *oname = NULL;
8330 char *from_name = NULL, *to_name = NULL;
8331 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8332 int from_len = 0, to_len = 0;
8333 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8334 int holding_mntlock;
8335 int vn_authorize_skipped;
8336 mount_t locked_mp = NULL;
8337 vnode_t oparent = NULLVP;
8338 #if CONFIG_FSE
8339 fse_info from_finfo = {}, to_finfo;
8340 #endif
8341 int from_truncated = 0, to_truncated = 0;
8342 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8343 int batched = 0;
8344 struct vnode_attr *fvap, *tvap;
8345 int continuing = 0;
8346 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8347 int32_t nofollow_any = 0;
8348 /* carving out a chunk for structs that are too big to be on stack. */
8349 struct {
8350 struct nameidata from_node, to_node;
8351 struct vnode_attr fv_attr, tv_attr;
8352 } * __rename_data;
8353
8354 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8355 fromnd = &__rename_data->from_node;
8356 tond = &__rename_data->to_node;
8357
8358 holding_mntlock = 0;
8359 do_retry = 0;
8360 retry_count = 0;
8361 retry:
8362 fvp = tvp = NULL;
8363 fdvp = tdvp = NULL;
8364 fvap = tvap = NULL;
8365 mnt_fvp = NULLVP;
8366 mntrename = FALSE;
8367 vn_authorize_skipped = FALSE;
8368
8369 if (uflags & RENAME_NOFOLLOW_ANY) {
8370 nofollow_any = NAMEI_NOFOLLOW_ANY;
8371 }
8372 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8373 segflg, from, ctx);
8374 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8375
8376 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8377 segflg, to, ctx);
8378 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8379
8380 continue_lookup:
8381 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8382 if ((error = nameiat(fromnd, fromfd))) {
8383 goto out1;
8384 }
8385 fdvp = fromnd->ni_dvp;
8386 fvp = fromnd->ni_vp;
8387
8388 if (fvp && fvp->v_type == VDIR) {
8389 tond->ni_cnd.cn_flags |= WILLBEDIR;
8390 }
8391 }
8392
8393 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8394 if ((error = nameiat(tond, tofd))) {
8395 /*
8396 * Translate error code for rename("dir1", "dir2/.").
8397 */
8398 if (error == EISDIR && fvp->v_type == VDIR) {
8399 error = EINVAL;
8400 }
8401 goto out1;
8402 }
8403 tdvp = tond->ni_dvp;
8404 tvp = tond->ni_vp;
8405 }
8406
8407 #if DEVELOPMENT || DEBUG
8408 /*
8409 * XXX VSWAP: Check for entitlements or special flag here
8410 * so we can restrict access appropriately.
8411 */
8412 #else /* DEVELOPMENT || DEBUG */
8413
8414 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8415 error = EPERM;
8416 goto out1;
8417 }
8418
8419 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8420 error = EPERM;
8421 goto out1;
8422 }
8423 #endif /* DEVELOPMENT || DEBUG */
8424
8425 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8426 error = ENOENT;
8427 goto out1;
8428 }
8429
8430 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8431 int32_t pval = 0;
8432 int err = 0;
8433
8434 /*
8435 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8436 * has the same name as target iff the following conditions are met:
8437 * 1. the target file system is case insensitive
8438 * 2. source and target directories are the same
8439 * 3. source and target files are the same
8440 * 4. name only differs in case (determined by underlying filesystem)
8441 */
8442 if (fvp != tvp || fdvp != tdvp) {
8443 error = EEXIST;
8444 goto out1;
8445 }
8446
8447 /*
8448 * Assume that the target file system is case sensitive if
8449 * _PC_CASE_SENSITIVE selector isn't supported.
8450 */
8451 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8452 if (err != 0 || pval != 0) {
8453 error = EEXIST;
8454 goto out1;
8455 }
8456 }
8457
8458 batched = vnode_compound_rename_available(fdvp);
8459
8460 #if CONFIG_FSE
8461 need_event = need_fsevent(FSE_RENAME, fdvp);
8462 if (need_event) {
8463 if (fvp) {
8464 get_fse_info(fvp, &from_finfo, ctx);
8465 } else {
8466 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8467 if (error) {
8468 goto out1;
8469 }
8470
8471 fvap = &__rename_data->fv_attr;
8472 }
8473
8474 if (tvp) {
8475 get_fse_info(tvp, &to_finfo, ctx);
8476 } else if (batched) {
8477 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8478 if (error) {
8479 goto out1;
8480 }
8481
8482 tvap = &__rename_data->tv_attr;
8483 }
8484 }
8485 #else
8486 need_event = 0;
8487 #endif /* CONFIG_FSE */
8488
8489 has_listeners = kauth_authorize_fileop_has_listeners();
8490
8491 need_kpath2 = 0;
8492 #if CONFIG_AUDIT
8493 if (AUDIT_RECORD_EXISTS()) {
8494 need_kpath2 = 1;
8495 }
8496 #endif
8497
8498 if (need_event || has_listeners) {
8499 if (from_name == NULL) {
8500 GET_PATH(from_name);
8501 }
8502
8503 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8504
8505 if (from_name_no_firmlink == NULL) {
8506 GET_PATH(from_name_no_firmlink);
8507 }
8508
8509 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8510 }
8511
8512 if (need_event || need_kpath2 || has_listeners) {
8513 if (to_name == NULL) {
8514 GET_PATH(to_name);
8515 }
8516
8517 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8518
8519 if (to_name_no_firmlink == NULL) {
8520 GET_PATH(to_name_no_firmlink);
8521 }
8522
8523 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8524 if (to_name && need_kpath2) {
8525 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8526 }
8527 }
8528 if (!fvp) {
8529 /*
8530 * Claim: this check will never reject a valid rename.
8531 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8532 * Suppose fdvp and tdvp are not on the same mount.
8533 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8534 * then you can't move it to within another dir on the same mountpoint.
8535 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8536 *
8537 * If this check passes, then we are safe to pass these vnodes to the same FS.
8538 */
8539 if (fdvp->v_mount != tdvp->v_mount) {
8540 error = EXDEV;
8541 goto out1;
8542 }
8543 goto skipped_lookup;
8544 }
8545
8546 /*
8547 * If the source and destination are the same (i.e. they're
8548 * links to the same vnode) and the target file system is
8549 * case sensitive, then there is nothing to do.
8550 *
8551 * XXX Come back to this.
8552 */
8553 if (fvp == tvp) {
8554 int pathconf_val;
8555
8556 /*
8557 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8558 * then assume that this file system is case sensitive.
8559 */
8560 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8561 pathconf_val != 0) {
8562 vn_authorize_skipped = TRUE;
8563 goto out1;
8564 }
8565 }
8566
8567 /*
8568 * Allow the renaming of mount points.
8569 * - target must not exist
8570 * - target must reside in the same directory as source
8571 * - union mounts cannot be renamed
8572 * - the root fs, and tightly-linked system volumes, cannot be renamed
8573 *
8574 * XXX Handle this in VFS after a continued lookup (if we missed
8575 * in the cache to start off)
8576 *
8577 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8578 * we'll skip past here. The file system is responsible for
8579 * checking that @tvp is not a descendent of @fvp and vice versa
8580 * so it should always return EINVAL if either @tvp or @fvp is the
8581 * root of a volume.
8582 */
8583 if ((fvp->v_flag & VROOT) &&
8584 (fvp->v_type == VDIR) &&
8585 (tvp == NULL) &&
8586 (fvp->v_mountedhere == NULL) &&
8587 (fdvp == tdvp) &&
8588 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8589 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8590 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8591 vnode_t coveredvp;
8592
8593 /* switch fvp to the covered vnode */
8594 coveredvp = fvp->v_mount->mnt_vnodecovered;
8595 if ((vnode_getwithref(coveredvp))) {
8596 error = ENOENT;
8597 goto out1;
8598 }
8599 /*
8600 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
8601 * later.
8602 */
8603 mnt_fvp = fvp;
8604
8605 fvp = coveredvp;
8606 mntrename = TRUE;
8607 }
8608 /*
8609 * Check for cross-device rename.
8610 */
8611 if ((fvp->v_mount != tdvp->v_mount) ||
8612 (tvp && (fvp->v_mount != tvp->v_mount))) {
8613 error = EXDEV;
8614 goto out1;
8615 }
8616
8617 /*
8618 * If source is the same as the destination (that is the
8619 * same inode number) then there is nothing to do...
8620 * EXCEPT if the underlying file system supports case
8621 * insensitivity and is case preserving. In this case
8622 * the file system needs to handle the special case of
8623 * getting the same vnode as target (fvp) and source (tvp).
8624 *
8625 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8626 * and _PC_CASE_PRESERVING can have this exception, and they need to
8627 * handle the special case of getting the same vnode as target and
8628 * source. NOTE: Then the target is unlocked going into vnop_rename,
8629 * so not to cause locking problems. There is a single reference on tvp.
8630 *
8631 * NOTE - that fvp == tvp also occurs if they are hard linked and
8632 * that correct behaviour then is just to return success without doing
8633 * anything.
8634 *
8635 * XXX filesystem should take care of this itself, perhaps...
8636 */
8637 if (fvp == tvp && fdvp == tdvp) {
8638 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8639 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8640 fromnd->ni_cnd.cn_namelen)) {
8641 vn_authorize_skipped = TRUE;
8642 goto out1;
8643 }
8644 }
8645
8646 if (holding_mntlock && fvp->v_mount != locked_mp) {
8647 /*
8648 * we're holding a reference and lock
8649 * on locked_mp, but it no longer matches
8650 * what we want to do... so drop our hold
8651 */
8652 mount_unlock_renames(locked_mp);
8653 mount_drop(locked_mp, 0);
8654 holding_mntlock = 0;
8655 }
8656 if (tdvp != fdvp && fvp->v_type == VDIR) {
8657 /*
8658 * serialize renames that re-shape
8659 * the tree... if holding_mntlock is
8660 * set, then we're ready to go...
8661 * otherwise we
8662 * first need to drop the iocounts
8663 * we picked up, second take the
8664 * lock to serialize the access,
8665 * then finally start the lookup
8666 * process over with the lock held
8667 */
8668 if (!holding_mntlock) {
8669 /*
8670 * need to grab a reference on
8671 * the mount point before we
8672 * drop all the iocounts... once
8673 * the iocounts are gone, the mount
8674 * could follow
8675 */
8676 locked_mp = fvp->v_mount;
8677 mount_ref(locked_mp, 0);
8678
8679 /*
8680 * nameidone has to happen before we vnode_put(tvp)
8681 * since it may need to release the fs_nodelock on the tvp
8682 */
8683 nameidone(tond);
8684
8685 if (tvp) {
8686 vnode_put(tvp);
8687 }
8688 vnode_put(tdvp);
8689
8690 /*
8691 * nameidone has to happen before we vnode_put(fdvp)
8692 * since it may need to release the fs_nodelock on the fvp
8693 */
8694 nameidone(fromnd);
8695
8696 vnode_put(fvp);
8697 vnode_put(fdvp);
8698
8699 if (mnt_fvp != NULLVP) {
8700 vnode_put(mnt_fvp);
8701 }
8702
8703 mount_lock_renames(locked_mp);
8704 holding_mntlock = 1;
8705
8706 goto retry;
8707 }
8708 } else {
8709 /*
8710 * when we dropped the iocounts to take
8711 * the lock, we allowed the identity of
8712 * the various vnodes to change... if they did,
8713 * we may no longer be dealing with a rename
8714 * that reshapes the tree... once we're holding
8715 * the iocounts, the vnodes can't change type
8716 * so we're free to drop the lock at this point
8717 * and continue on
8718 */
8719 if (holding_mntlock) {
8720 mount_unlock_renames(locked_mp);
8721 mount_drop(locked_mp, 0);
8722 holding_mntlock = 0;
8723 }
8724 }
8725
8726 if (!batched) {
8727 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
8728 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8729 flags, NULL);
8730 if (error) {
8731 if (error == ENOENT) {
8732 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8733 /*
8734 * We encountered a race where after doing the namei,
8735 * tvp stops being valid. If so, simply re-drive the rename
8736 * call from the top.
8737 */
8738 do_retry = 1;
8739 retry_count += 1;
8740 }
8741 }
8742 goto out1;
8743 }
8744 }
8745
8746 /* Release the 'mnt_fvp' now that it is no longer needed. */
8747 if (mnt_fvp != NULLVP) {
8748 vnode_put(mnt_fvp);
8749 mnt_fvp = NULLVP;
8750 }
8751
8752 // save these off so we can later verify that fvp is the same
8753 oname = fvp->v_name;
8754 oparent = fvp->v_parent;
8755
8756 skipped_lookup:
8757 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8758 tdvp, &tvp, &tond->ni_cnd, tvap,
8759 flags, ctx);
8760
8761 if (holding_mntlock) {
8762 /*
8763 * we can drop our serialization
8764 * lock now
8765 */
8766 mount_unlock_renames(locked_mp);
8767 mount_drop(locked_mp, 0);
8768 holding_mntlock = 0;
8769 }
8770 if (error) {
8771 if (error == EDATALESS) {
8772 /*
8773 * If we've been here before, something has gone
8774 * horribly wrong and we should just get out lest
8775 * we spiral around the drain forever.
8776 */
8777 if (flags & VFS_RENAME_DATALESS) {
8778 error = EIO;
8779 goto out1;
8780 }
8781
8782 /*
8783 * The object we're renaming is dataless (or has a
8784 * dataless descendent) and requires materialization
8785 * before the rename occurs. But we're holding the
8786 * mount point's rename lock, so it's not safe to
8787 * make the upcall.
8788 *
8789 * In this case, we release the lock, perform the
8790 * materialization, and start the whole thing over.
8791 */
8792 error = vnode_materialize_dataless_file(fvp,
8793 NAMESPACE_HANDLER_RENAME_OP);
8794
8795 if (error == 0) {
8796 /*
8797 * The next time around we need to tell the
				 * file system that the materialization has
8799 * been performed.
8800 */
8801 flags |= VFS_RENAME_DATALESS;
8802 do_retry = 1;
8803 }
8804 goto out1;
8805 }
8806 if (error == EKEEPLOOKING) {
8807 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8808 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8809 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8810 }
8811 }
8812
8813 fromnd->ni_vp = fvp;
8814 tond->ni_vp = tvp;
8815
8816 goto continue_lookup;
8817 }
8818
8819 /*
8820 * We may encounter a race in the VNOP where the destination didn't
8821 * exist when we did the namei, but it does by the time we go and
8822 * try to create the entry. In this case, we should re-drive this rename
8823 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8824 * but other filesystems susceptible to this race could return it, too.
8825 */
8826 if (error == ERECYCLE) {
8827 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8828 do_retry = 1;
8829 retry_count += 1;
8830 } else {
8831 printf("rename retry limit due to ERECYCLE reached\n");
8832 error = ENOENT;
8833 }
8834 }
8835
8836 /*
8837 * For compound VNOPs, the authorization callback may return
8838 * ENOENT in case of racing hardlink lookups hitting the name
8839 * cache, redrive the lookup.
8840 */
8841 if (batched && error == ENOENT) {
8842 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8843 do_retry = 1;
8844 retry_count += 1;
8845 }
8846 }
8847
8848 goto out1;
8849 }
8850
8851 /* call out to allow 3rd party notification of rename.
8852 * Ignore result of kauth_authorize_fileop call.
8853 */
8854 kauth_authorize_fileop(vfs_context_ucred(ctx),
8855 KAUTH_FILEOP_RENAME,
8856 (uintptr_t)from_name, (uintptr_t)to_name);
8857 if (flags & VFS_RENAME_SWAP) {
8858 kauth_authorize_fileop(vfs_context_ucred(ctx),
8859 KAUTH_FILEOP_RENAME,
8860 (uintptr_t)to_name, (uintptr_t)from_name);
8861 }
8862
8863 #if CONFIG_FSE
8864 if (from_name != NULL && to_name != NULL) {
8865 if (from_truncated || to_truncated) {
8866 // set it here since only the from_finfo gets reported up to user space
8867 from_finfo.mode |= FSE_TRUNCATED_PATH;
8868 }
8869
8870 if (tvap && tvp) {
8871 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8872 }
8873 if (fvap) {
8874 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8875 }
8876
8877 if (tvp) {
8878 add_fsevent(FSE_RENAME, ctx,
8879 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8880 FSE_ARG_FINFO, &from_finfo,
8881 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8882 FSE_ARG_FINFO, &to_finfo,
8883 FSE_ARG_DONE);
8884 if (flags & VFS_RENAME_SWAP) {
8885 /*
8886 * Strictly speaking, swap is the equivalent of
8887 * *three* renames. FSEvents clients should only take
8888 * the events as a hint, so we only bother reporting
8889 * two.
8890 */
8891 add_fsevent(FSE_RENAME, ctx,
8892 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8893 FSE_ARG_FINFO, &to_finfo,
8894 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8895 FSE_ARG_FINFO, &from_finfo,
8896 FSE_ARG_DONE);
8897 }
8898 } else {
8899 add_fsevent(FSE_RENAME, ctx,
8900 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8901 FSE_ARG_FINFO, &from_finfo,
8902 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8903 FSE_ARG_DONE);
8904 }
8905 }
8906 #endif /* CONFIG_FSE */
8907
8908 /*
8909 * update filesystem's mount point data
8910 */
8911 if (mntrename) {
8912 char *cp, *pathend, *mpname;
8913 char * tobuf;
8914 struct mount *mp;
8915 int maxlen;
8916 size_t len = 0;
8917
8918 mp = fvp->v_mountedhere;
8919
8920 if (vfs_busy(mp, LK_NOWAIT)) {
8921 error = EBUSY;
8922 goto out1;
8923 }
8924 tobuf = zalloc(ZV_NAMEI);
8925
8926 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8927 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8928 } else {
8929 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8930 }
8931 if (!error) {
8932 /* find current mount point prefix */
8933 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8934 for (cp = pathend; *cp != '\0'; ++cp) {
8935 if (*cp == '/') {
8936 pathend = cp + 1;
8937 }
8938 }
8939 /* find last component of target name */
8940 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8941 if (*cp == '/') {
8942 mpname = cp + 1;
8943 }
8944 }
8945
8946 /* Update f_mntonname of sub mounts */
8947 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8948
8949 /* append name to prefix */
8950 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
8951 bzero(pathend, maxlen);
8952
8953 strlcpy(pathend, mpname, maxlen);
8954 }
8955 zfree(ZV_NAMEI, tobuf);
8956
8957 vfs_unbusy(mp);
8958
8959 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8960 }
8961 /*
8962 * fix up name & parent pointers. note that we first
8963 * check that fvp has the same name/parent pointers it
8964 * had before the rename call... this is a 'weak' check
8965 * at best...
8966 *
8967 * XXX oparent and oname may not be set in the compound vnop case
8968 */
8969 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8970 int update_flags;
8971
8972 update_flags = VNODE_UPDATE_NAME;
8973
8974 if (fdvp != tdvp) {
8975 update_flags |= VNODE_UPDATE_PARENT;
8976 }
8977
8978 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8979 }
8980 out1:
8981 /*
8982 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
8983 * skipped earlier as no actual rename was performed.
8984 */
8985 if (vn_authorize_skipped && error == 0) {
8986 error = vn_authorize_renamex_with_paths(fdvp, fvp,
8987 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8988 flags, NULL);
8989 if (error && error == ENOENT) {
8990 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8991 do_retry = 1;
8992 retry_count += 1;
8993 }
8994 }
8995 }
8996 if (to_name != NULL) {
8997 RELEASE_PATH(to_name);
8998 to_name = NULL;
8999 }
9000 if (to_name_no_firmlink != NULL) {
9001 RELEASE_PATH(to_name_no_firmlink);
9002 to_name_no_firmlink = NULL;
9003 }
9004 if (from_name != NULL) {
9005 RELEASE_PATH(from_name);
9006 from_name = NULL;
9007 }
9008 if (from_name_no_firmlink != NULL) {
9009 RELEASE_PATH(from_name_no_firmlink);
9010 from_name_no_firmlink = NULL;
9011 }
9012 if (holding_mntlock) {
9013 mount_unlock_renames(locked_mp);
9014 mount_drop(locked_mp, 0);
9015 holding_mntlock = 0;
9016 }
9017 if (tdvp) {
9018 /*
9019 * nameidone has to happen before we vnode_put(tdvp)
9020 * since it may need to release the fs_nodelock on the tdvp
9021 */
9022 nameidone(tond);
9023
9024 if (tvp) {
9025 vnode_put(tvp);
9026 }
9027 vnode_put(tdvp);
9028 }
9029 if (fdvp) {
9030 /*
9031 * nameidone has to happen before we vnode_put(fdvp)
9032 * since it may need to release the fs_nodelock on the fdvp
9033 */
9034 nameidone(fromnd);
9035
9036 if (fvp) {
9037 vnode_put(fvp);
9038 }
9039 vnode_put(fdvp);
9040 }
9041 if (mnt_fvp != NULLVP) {
9042 vnode_put(mnt_fvp);
9043 }
9044 /*
9045 * If things changed after we did the namei, then we will re-drive
9046 * this rename call from the top.
9047 */
9048 if (do_retry) {
9049 do_retry = 0;
9050 goto retry;
9051 }
9052
9053 kfree_type(typeof(*__rename_data), __rename_data);
9054 return error;
9055 }
9056
9057 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9058 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9059 {
9060 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9061 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9062 }
9063
9064 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9065 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9066 {
9067 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9068 return EINVAL;
9069 }
9070
9071 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9072 return EINVAL;
9073 }
9074
9075 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9076 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9077 }
9078
9079 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9080 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9081 {
9082 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9083 uap->tofd, uap->to, UIO_USERSPACE, 0);
9084 }
9085
9086 /*
9087 * Make a directory file.
9088 *
9089 * Returns: 0 Success
9090 * EEXIST
9091 * namei:???
9092 * vnode_authorize:???
9093 * vn_create:???
9094 */
9095 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * CREATE lookup; LOCKPARENT keeps the parent directory (dvp)
	 * referenced so we can create the new entry in it.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Allow the filesystem's compound mkdir VNOP, if it has one. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The lookup resolved to an existing node: mkdir must fail. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the CREATE lookup state before re-resolving. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST, not the auth error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound VNOP asked us to redrive the lookup. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9207
9208 /*
9209 * mkdir_extended: Create a directory; with extended security (ACL).
9210 *
9211 * Parameters: p Process requesting to create the directory
9212 * uap User argument descriptor (see below)
9213 * retval (ignored)
9214 *
9215 * Indirect: uap->path Path of directory to create
9216 * uap->mode Access permissions to set
9217 * uap->xsecurity ACL to set
9218 *
9219 * Returns: 0 Success
9220 * !0 Not success
9221 *
9222 */
9223 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9224 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9225 {
9226 int ciferror;
9227 kauth_filesec_t xsecdst;
9228 struct vnode_attr va;
9229
9230 AUDIT_ARG(owner, uap->uid, uap->gid);
9231
9232 xsecdst = NULL;
9233 if ((uap->xsecurity != USER_ADDR_NULL) &&
9234 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9235 return ciferror;
9236 }
9237
9238 VATTR_INIT(&va);
9239 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9240 if (xsecdst != NULL) {
9241 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9242 va.va_vaflags |= VA_FILESEC_ACL;
9243 }
9244
9245 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9246 UIO_USERSPACE);
9247 if (xsecdst != NULL) {
9248 kauth_filesec_free(xsecdst);
9249 }
9250 return ciferror;
9251 }
9252
9253 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9254 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9255 {
9256 struct vnode_attr va;
9257
9258 VATTR_INIT(&va);
9259 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9260
9261 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9262 UIO_USERSPACE);
9263 }
9264
9265 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9266 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9267 {
9268 struct vnode_attr va;
9269
9270 VATTR_INIT(&va);
9271 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9272
9273 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9274 UIO_USERSPACE);
9275 }
9276
/*
 * rmdirat_internal: remove the directory named by dirpath, resolved
 * relative to fd.  Handles compound-VNOP filesystems, fsevents/kauth
 * notification, dataless-directory removal, and (on CONFIG_APPLEDOUBLE)
 * cleanup of orphaned AppleDouble files with a bounded restart loop.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep large structs off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;		/* bounds ENOENT-race redrives */
	int batched;			/* nonzero when using a compound rmdir VNOP */

	int restart_flag;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel context may remove a swap-file directory. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup hit the name cache: redrive (bounded). */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp from lookup: only legal in the compound-VNOP case. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the FS to gather attrs during the VNOP. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the pathnames only if someone will consume them. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup redriven. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR(). So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * NOTE(review): the wakeup/tsleep pair below appears to
		 * serialize concurrent removers during the AppleDouble
		 * restart dance, using vp's address as the sleep channel.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
9550
9551 /*
9552 * Remove a directory file.
9553 */
9554 /* ARGSUSED */
9555 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)9556 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9557 {
9558 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9559 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9560 }
9561
9562 /* Get direntry length padded to 8 byte alignment */
9563 #define DIRENT64_LEN(namlen) \
9564 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
9565
9566 /* Get dirent length padded to 4 byte alignment */
9567 #define DIRENT_LEN(namelen) \
9568 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
9569
9570 /* Get the end of this dirent */
9571 #define DIRENT_END(dep) \
9572 (((char *)(dep)) + (dep)->d_reclen - 1)
9573
9574 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)9575 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9576 int *numdirent, vfs_context_t ctxp)
9577 {
9578 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9579 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9580 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9581 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9582 } else {
9583 size_t bufsize;
9584 void * bufptr;
9585 uio_t auio;
9586 struct direntry *entry64;
9587 struct dirent *dep;
9588 size_t bytesread;
9589 int error;
9590
9591 /*
9592 * We're here because the underlying file system does not
9593 * support direnties or we mounted denying support so we must
9594 * fall back to dirents and convert them to direntries.
9595 *
9596 * Our kernel buffer needs to be smaller since re-packing will
9597 * expand each dirent. The worse case (when the name length
9598 * is 3 or less) corresponds to a struct direntry size of 32
9599 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9600 * (4-byte aligned). So having a buffer that is 3/8 the size
9601 * will prevent us from reading more than we can pack.
9602 *
9603 * Since this buffer is wired memory, we will limit the
9604 * buffer size to a maximum of 32K. We would really like to
9605 * use 32K in the MIN(), but we use magic number 87371 to
9606 * prevent uio_resid() * 3 / 8 from overflowing.
9607 */
9608 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9609 bufptr = kalloc_data(bufsize, Z_WAITOK);
9610 if (bufptr == NULL) {
9611 return ENOMEM;
9612 }
9613
9614 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9615 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9616 auio->uio_offset = uio->uio_offset;
9617
9618 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9619
9620 dep = (struct dirent *)bufptr;
9621 bytesread = bufsize - uio_resid(auio);
9622
9623 entry64 = kalloc_type(struct direntry, Z_WAITOK);
9624 /*
9625 * Convert all the entries and copy them out to user's buffer.
9626 */
9627 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9628 /* First check that the dirent struct up to d_name is within the buffer */
9629 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
9630 /* Check that the length of the entire dirent is within the buffer */
9631 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9632 /* Check that the actual length including the name doesn't exceed d_reclen */
9633 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9634 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9635 vp->v_mount->mnt_vfsstat.f_mntonname,
9636 vp->v_name ? vp->v_name : "<unknown>");
9637 error = EIO;
9638 break;
9639 }
9640
9641 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9642
9643 bzero(entry64, enbufsize);
9644 /* Convert a dirent to a dirent64. */
9645 entry64->d_ino = dep->d_ino;
9646 entry64->d_seekoff = 0;
9647 entry64->d_reclen = (uint16_t)enbufsize;
9648 entry64->d_namlen = dep->d_namlen;
9649 entry64->d_type = dep->d_type;
9650 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9651
9652 /* Move to next entry. */
9653 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9654
9655 /* Copy entry64 to user's buffer. */
9656 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9657 }
9658
9659 /* Update the real offset using the offset we got from VNOP_READDIR. */
9660 if (error == 0) {
9661 uio->uio_offset = auio->uio_offset;
9662 }
9663 uio_free(auio);
9664 kfree_data(bufptr, bufsize);
9665 kfree_type(struct direntry, entry64);
9666 return error;
9667 }
9668 }
9669
9670 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9671
9672 /*
9673 * Read a block of directory entries in a file system independent format.
9674 */
/*
 * getdirentries_common: shared implementation for getdirentries(2) and
 * getdirentries64(2).  Reads entries from the directory open on fd into
 * the user buffer, advancing the file offset under the fd offset lock,
 * and transparently falls through to the upper layer of a union mount
 * when the covered directory is exhausted.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize offset updates on this open file.  If the fd's backing
	 * data changed between lookup and lock (e.g. union-mount swap by a
	 * racing thread), drop everything and retry from the top.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request to the maximum single-call buffer size. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the fd's current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: move to the vnode
	 * below and continue reading from offset 0 there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				/* Replace the fd's backing vnode with the lower one. */
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
9788
9789
9790 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)9791 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9792 {
9793 off_t offset;
9794 ssize_t bytesread;
9795 int error, eofflag;
9796
9797 AUDIT_ARG(fd, uap->fd);
9798 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9799 &bytesread, &offset, &eofflag, 0);
9800
9801 if (error == 0) {
9802 if (proc_is64bit(p)) {
9803 user64_long_t base = (user64_long_t)offset;
9804 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9805 } else {
9806 user32_long_t base = (user32_long_t)offset;
9807 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9808 }
9809 *retval = (int)bytesread;
9810 }
9811 return error;
9812 }
9813
9814 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)9815 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9816 {
9817 off_t offset;
9818 ssize_t bytesread;
9819 int error, eofflag;
9820 user_size_t bufsize;
9821
9822 AUDIT_ARG(fd, uap->fd);
9823
9824 /*
9825 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9826 * then the kernel carves out the last 4 bytes to return extended
9827 * information to userspace (namely whether we reached EOF with this call).
9828 */
9829 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9830 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9831 } else {
9832 bufsize = uap->bufsize;
9833 }
9834
9835 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9836 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9837
9838 if (error == 0) {
9839 *retval = bytesread;
9840 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9841
9842 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9843 getdirentries64_flags_t flags = 0;
9844 if (eofflag) {
9845 flags |= GETDIRENTRIES64_EOF;
9846 }
9847 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9848 sizeof(flags));
9849 }
9850 }
9851 return error;
9852 }
9853
9854
9855 /*
9856 * Set the mode mask for creation of filesystem nodes.
9857 * XXX implement xsecurity
9858 */
9859 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9860 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)9861 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9862 {
9863 AUDIT_ARG(mask, newmask);
9864 proc_fdlock(p);
9865 *retval = p->p_fd.fd_cmask;
9866 p->p_fd.fd_cmask = newmask & ALLPERMS;
9867 proc_fdunlock(p);
9868 return 0;
9869 }
9870
9871 /*
9872 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9873 *
9874 * Parameters: p Process requesting to set the umask
9875 * uap User argument descriptor (see below)
9876 * retval umask of the process (parameter p)
9877 *
9878 * Indirect: uap->newmask umask to set
9879 * uap->xsecurity ACL to set
9880 *
9881 * Returns: 0 Success
9882 * !0 Not success
9883 *
9884 */
9885 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)9886 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9887 {
9888 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
9889 }
9890
9891 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)9892 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9893 {
9894 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9895 }
9896
9897 /*
9898 * Void all references to file by ripping underlying filesystem
9899 * away from vnode.
9900 */
9901 /* ARGSUSED */
/*
 * revoke(2): revoke all access to a character or block device node by
 * tearing every open reference away from the underlying vnode.  Only
 * the owner of the node or the superuser may do this.
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke() only applies to character and block special files. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a mounted filesystem cannot be revoked. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Fetch the owner so we can enforce owner-or-superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually holds the vnode open. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
9954
9955
9956 /*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
9958 * The following system calls are designed to support features
9959 * which are specific to the HFS & HFS Plus volume formats
9960 */
9961
9962
9963 /*
9964 * Obtain attribute information on objects in a directory while enumerating
9965 * the directory.
9966 */
/* ARGSUSED */
/*
 * getdirentriesattr(fd, alist, buffer, buffersize, count, basep, newstate, options)
 *
 * Enumerate a directory, returning attribute information for each entry
 * via VNOP_READDIRATTR.  On success *retval is the EOF flag (0 or 1),
 * the caller's count is updated with the number of entries returned,
 * newstate receives the directory's change state, and basep receives
 * the directory offset at which this read began.
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* remember the caller's count so it can be restored if we descend a union layer */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then verify the fd still refers to
	 * the vnode we looked up: a concurrent union-mount traversal (below)
	 * can swap the fd's backing vnode, in which case retry the lookup.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* the fd must have been opened for reading */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					/* swap the fd's backing vnode to the lower layer and restart */
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* report results back to the caller */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error from earlier; retval is 0 or 1 now */
} /* end of getdirentriesattr system call */
10131
10132 /*
10133 * Exchange data between two files
10134 */
10135
/* ARGSUSED */
/*
 * exchangedata(path1, path2, options)
 *
 * Atomically exchange the data of two regular files on the same volume
 * via VNOP_EXCHANGE.  On success the vnodes' cached names and parents
 * are swapped to match, third-party fileop listeners are notified, and
 * an FSE_EXCHANGE fsevent is posted.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* look up the first path; holds an iocount on fvp on success */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* look up the second path; holds an iocount on svp on success */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* caller needs read+write on both files */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only compute the (possibly expensive) paths if an fsevent watcher
	 * or a fileop listener will actually consume them.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The on-disk contents were exchanged, so swap the cached
		 * names (and parents, if they differ) under the name cache
		 * lock to keep the namecache consistent.
		 */
		name_cache_lock();

		tmpname   = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10287
10288 /*
10289 * Return (in MB) the amount of freespace on the given vnode's volume.
10290 */
10291 uint32_t freespace_mb(vnode_t vp);
10292
10293 uint32_t
freespace_mb(vnode_t vp)10294 freespace_mb(vnode_t vp)
10295 {
10296 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10297 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10298 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10299 }
10300
10301 #if CONFIG_SEARCHFS
10302
10303 /* ARGSUSED */
10304
10305 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10306 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10307 {
10308 vnode_t vp, tvp;
10309 int i, error = 0;
10310 int fserror = 0;
10311 struct nameidata nd;
10312 struct user64_fssearchblock searchblock;
10313 struct searchstate *state;
10314 struct attrlist *returnattrs;
10315 struct timeval timelimit;
10316 void *searchparams1, *searchparams2;
10317 uio_t auio = NULL;
10318 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10319 uint32_t nummatches;
10320 size_t mallocsize;
10321 uint32_t nameiflags;
10322 vfs_context_t ctx = vfs_context_current();
10323 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10324
10325 /* Start by copying in fsearchblock parameter list */
10326 if (IS_64BIT_PROCESS(p)) {
10327 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10328 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10329 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10330 } else {
10331 struct user32_fssearchblock tmp_searchblock;
10332
10333 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10334 // munge into 64-bit version
10335 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10336 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10337 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10338 searchblock.maxmatches = tmp_searchblock.maxmatches;
10339 /*
10340 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10341 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10342 */
10343 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10344 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10345 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10346 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10347 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10348 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10349 searchblock.searchattrs = tmp_searchblock.searchattrs;
10350 }
10351 if (error) {
10352 return error;
10353 }
10354
10355 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10356 */
10357 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10358 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10359 return EINVAL;
10360 }
10361
10362 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10363 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10364 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10365 /* block. */
10366 /* */
10367 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10368 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10369 /* assumes the size is still 556 bytes it will continue to work */
10370
10371 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10372 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10373
10374 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10375
10376 /* Now set up the various pointers to the correct place in our newly allocated memory */
10377
10378 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10379 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
10380 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
10381
10382 /* Now copy in the stuff given our local variables. */
10383
10384 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
10385 goto freeandexit;
10386 }
10387
10388 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
10389 goto freeandexit;
10390 }
10391
10392 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
10393 goto freeandexit;
10394 }
10395
10396 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
10397 goto freeandexit;
10398 }
10399
10400 /*
10401 * When searching a union mount, need to set the
10402 * start flag at the first call on each layer to
10403 * reset state for the new volume.
10404 */
10405 if (uap->options & SRCHFS_START) {
10406 state->ss_union_layer = 0;
10407 } else {
10408 uap->options |= state->ss_union_flags;
10409 }
10410 state->ss_union_flags = 0;
10411
10412 /*
10413 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10414 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10415 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10416 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10417 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10418 */
10419
10420 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10421 attrreference_t* string_ref;
10422 u_int32_t* start_length;
10423 user64_size_t param_length;
10424
10425 /* validate searchparams1 */
10426 param_length = searchblock.sizeofsearchparams1;
10427 /* skip the word that specifies length of the buffer */
10428 start_length = (u_int32_t*) searchparams1;
10429 start_length = start_length + 1;
10430 string_ref = (attrreference_t*) start_length;
10431
10432 /* ensure no negative offsets or too big offsets */
10433 if (string_ref->attr_dataoffset < 0) {
10434 error = EINVAL;
10435 goto freeandexit;
10436 }
10437 if (string_ref->attr_length > MAXPATHLEN) {
10438 error = EINVAL;
10439 goto freeandexit;
10440 }
10441
10442 /* Check for pointer overflow in the string ref */
10443 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10444 error = EINVAL;
10445 goto freeandexit;
10446 }
10447
10448 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10449 error = EINVAL;
10450 goto freeandexit;
10451 }
10452 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10453 error = EINVAL;
10454 goto freeandexit;
10455 }
10456 }
10457
10458 /* set up the uio structure which will contain the users return buffer */
10459 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10460 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10461
10462 nameiflags = 0;
10463 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10464 nameiflags |= FOLLOW;
10465 }
10466 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10467 UIO_USERSPACE, uap->path, ctx);
10468
10469 error = namei(&nd);
10470 if (error) {
10471 goto freeandexit;
10472 }
10473 vp = nd.ni_vp;
10474 nameidone(&nd);
10475
10476 /*
10477 * Switch to the root vnode for the volume
10478 */
10479 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10480 vnode_put(vp);
10481 if (error) {
10482 goto freeandexit;
10483 }
10484 vp = tvp;
10485
10486 #if CONFIG_UNION_MOUNTS
10487 /*
10488 * If it's a union mount, the path lookup takes
10489 * us to the top layer. But we may need to descend
10490 * to a lower layer. For non-union mounts the layer
10491 * is always zero.
10492 */
10493 for (i = 0; i < (int) state->ss_union_layer; i++) {
10494 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10495 break;
10496 }
10497 tvp = vp;
10498 vp = vp->v_mount->mnt_vnodecovered;
10499 if (vp == NULL) {
10500 vnode_put(tvp);
10501 error = ENOENT;
10502 goto freeandexit;
10503 }
10504 error = vnode_getwithref(vp);
10505 vnode_put(tvp);
10506 if (error) {
10507 goto freeandexit;
10508 }
10509 }
10510 #endif /* CONFIG_UNION_MOUNTS */
10511
10512 #if CONFIG_MACF
10513 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
10514 if (error) {
10515 vnode_put(vp);
10516 goto freeandexit;
10517 }
10518 #endif
10519
10520
10521 /*
10522 * If searchblock.maxmatches == 0, then skip the search. This has happened
10523 * before and sometimes the underlying code doesnt deal with it well.
10524 */
10525 if (searchblock.maxmatches == 0) {
10526 nummatches = 0;
10527 goto saveandexit;
10528 }
10529
10530 /*
10531 * Allright, we have everything we need, so lets make that call.
10532 *
10533 * We keep special track of the return value from the file system:
10534 * EAGAIN is an acceptable error condition that shouldn't keep us
10535 * from copying out any results...
10536 */
10537
10538 fserror = VNOP_SEARCHFS(vp,
10539 searchparams1,
10540 searchparams2,
10541 &searchblock.searchattrs,
10542 (uint32_t)searchblock.maxmatches,
10543 &timelimit,
10544 returnattrs,
10545 &nummatches,
10546 (uint32_t)uap->scriptcode,
10547 (uint32_t)uap->options,
10548 auio,
10549 (struct searchstate *) &state->ss_fsstate,
10550 ctx);
10551
10552 #if CONFIG_UNION_MOUNTS
10553 /*
10554 * If it's a union mount we need to be called again
10555 * to search the mounted-on filesystem.
10556 */
10557 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10558 state->ss_union_flags = SRCHFS_START;
10559 state->ss_union_layer++; // search next layer down
10560 fserror = EAGAIN;
10561 }
10562 #endif /* CONFIG_UNION_MOUNTS */
10563
10564 saveandexit:
10565
10566 vnode_put(vp);
10567
10568 /* Now copy out the stuff that needs copying out. That means the number of matches, the
10569 * search state. Everything was already put into he return buffer by the vop call. */
10570
10571 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10572 goto freeandexit;
10573 }
10574
10575 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10576 goto freeandexit;
10577 }
10578
10579 error = fserror;
10580
10581 freeandexit:
10582
10583 kfree_data(searchparams1, mallocsize);
10584
10585 return error;
10586 } /* end of searchfs system call */
10587
10588 #else /* CONFIG_SEARCHFS */
10589
/* searchfs() stub used when the kernel is built without CONFIG_SEARCHFS */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10595
10596 #endif /* CONFIG_SEARCHFS */
10597
10598
10599 #if CONFIG_DATALESS_FILES
10600
10601 /*
10602 * === Namespace Resolver Up-call Mechanism ===
10603 *
10604 * When I/O is performed to a dataless file or directory (read, write,
10605 * lookup-in, etc.), the file system performs an upcall to the namespace
10606 * resolver (filecoordinationd) to materialize the object.
10607 *
10608 * We need multiple up-calls to be in flight at once, and we need these
10609 * up-calls to be interruptible, thus the following implementation:
10610 *
10611 * => The nspace_resolver_request represents the in-kernel request state.
10612 * It contains a request ID, storage space for the errno code returned
10613 * by filecoordinationd, and flags.
10614 *
10615 * => The request ID is simply a global monotonically incrementing 32-bit
10616 * number. Outstanding requests are stored in a hash table, and the
10617 * hash function is extremely simple.
10618 *
10619 * => When an upcall is to be made to filecoordinationd, a request structure
10620 * is allocated on the stack (it is small, and needs to live only during
10621 * the duration of the call to resolve_nspace_item_ext()). It is
10622 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
10624 * can be inserted into the table (and thus limiting the number of
10625 * outstanding requests issued to filecoordinationd); waiting for an
10626 * available slot is interruptible.
10627 *
10628 * => Once the request has been inserted into the table, the up-call is made
10629 * to filecoordinationd via a MiG-generated stub. The up-call returns
10630 * immediately and filecoordinationd processes the request asynchronously.
10631 *
 * => The caller now waits for the request to complete.  This is achieved by
10633 * sleeping on the address of the request structure and waiting for
10634 * filecoordinationd to mark the request structure as complete. This
10635 * is an interruptible sleep call; if interrupted, the request structure
10636 * is removed from the table and EINTR is returned to the caller. If
10637 * this occurs, an advisory up-call is made to filecoordinationd with
10638 * the request ID to indicate that the request can be aborted or
10639 * de-prioritized at the discretion of filecoordinationd.
10640 *
10641 * => When filecoordinationd has completed the request, it signals completion
10642 * by writing to the vfs.nspace.complete sysctl node. Only a process
10643 * decorated as a namespace resolver can write to this sysctl node. The
10644 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10645 * The request ID is looked up in the table, and if the request is found,
10646 * the error code is stored in the request structure and a wakeup()
10647 * issued on the address of the request structure. If the request is not
10648 * found, we simply drop the completion notification, assuming that the
10649 * caller was interrupted.
10650 *
10651 * => When the waiting thread wakes up, it extracts the error code from the
10652 * request structure, removes the request from the table, and returns the
10653 * error code to the calling function. Fini!
10654 */
10655
/*
 * In-kernel state for one materialization up-call to the namespace
 * resolver.  The structure lives on the requesting thread's stack for
 * the duration of the call and is linked into the request hash table.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t r_vp;                                   /* vnode being materialized */
	uint32_t r_req_id;                              /* unique request ID */
	int r_resolver_error;                           /* errno reported by the resolver */
	int r_flags;                                    /* RRF_* flags */
};

#define RRF_COMPLETE    0x0001                          /* resolver finished this request */
10665
/*
 * Allocate the next resolver request ID from a global monotonically
 * incrementing 32-bit counter.
 * NOTE(review): OSAddAtomic() returns the value *before* the addition,
 * so the first ID handed out is 0; 32-bit wrap-around is benign as long
 * as no two outstanding requests ever share an ID.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
10673
/* number of buckets in the outstanding-request hash table */
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
/* backpressure limit: max requests in flight to filecoordinationd */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* hash table of outstanding resolver requests; protected by the mutex below */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
/* mask applied to a request ID to select its bucket */
static u_long nspace_resolver_request_hashmask;
/* count of outstanding requests, used for backpressure */
static u_int nspace_resolver_request_count;
/* true when some thread is sleeping for a free table slot */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* guards the hash table and the bookkeeping variables above */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* trivial hash: request IDs are sequential, so the low bits spread evenly */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
10694
10695 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)10696 nspace_resolver_req_lookup(uint32_t req_id)
10697 {
10698 struct nspace_resolver_requesthead *bucket;
10699 struct nspace_resolver_request *req;
10700
10701 bucket = NSPACE_RESOLVER_HASH(req_id);
10702 LIST_FOREACH(req, bucket, r_hashlink) {
10703 if (req->r_req_id == req_id) {
10704 return req;
10705 }
10706 }
10707
10708 return NULL;
10709 }
10710
10711 static int
nspace_resolver_req_add(struct nspace_resolver_request * req)10712 nspace_resolver_req_add(struct nspace_resolver_request *req)
10713 {
10714 struct nspace_resolver_requesthead *bucket;
10715 int error;
10716
10717 while (nspace_resolver_request_count >=
10718 NSPACE_RESOLVER_MAX_OUTSTANDING) {
10719 nspace_resolver_request_wait_slot = true;
10720 error = msleep(&nspace_resolver_request_count,
10721 &nspace_resolver_request_hash_mutex,
10722 PVFS | PCATCH, "nspacerq", NULL);
10723 if (error) {
10724 return error;
10725 }
10726 }
10727
10728 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10729 #if DIAGNOSTIC
10730 assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
10731 #endif /* DIAGNOSTIC */
10732 LIST_INSERT_HEAD(bucket, req, r_hashlink);
10733 nspace_resolver_request_count++;
10734
10735 return 0;
10736 }
10737
10738 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)10739 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10740 {
10741 struct nspace_resolver_requesthead *bucket;
10742
10743 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10744 #if DIAGNOSTIC
10745 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10746 #endif /* DIAGNOSTIC */
10747 LIST_REMOVE(req, r_hashlink);
10748 nspace_resolver_request_count--;
10749
10750 if (nspace_resolver_request_wait_slot) {
10751 nspace_resolver_request_wait_slot = false;
10752 wakeup(&nspace_resolver_request_count);
10753 }
10754 }
10755
10756 static void
nspace_resolver_req_cancel(uint32_t req_id)10757 nspace_resolver_req_cancel(uint32_t req_id)
10758 {
10759 kern_return_t kr;
10760 mach_port_t mp;
10761
10762 // Failures here aren't fatal -- the cancellation message
10763 // sent to the resolver is merely advisory.
10764
10765 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10766 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10767 return;
10768 }
10769
10770 kr = send_nspace_resolve_cancel(mp, req_id);
10771 if (kr != KERN_SUCCESS) {
10772 os_log_error(OS_LOG_DEFAULT,
10773 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10774 }
10775
10776 ipc_port_release_send(mp);
10777 }
10778
/*
 * Sleep until the resolver marks the request complete, then remove it
 * from the table and return the resolver's errno.  If the sleep is
 * interrupted (or errors other than ERESTART), the request is torn down
 * locally with EINTR/ETIMEDOUT and an advisory cancel is sent to the
 * resolver after the lock is dropped.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/*
			 * Interrupted: record the error ourselves since the
			 * resolver never completed this request.
			 */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	/* the request is stack-allocated; it must leave the table before we return */
	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* cancel outside the lock -- it performs Mach IPC */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10808
/*
 * Record the resolver's result on the request, mark it complete, and
 * wake the thread sleeping in nspace_resolver_req_wait().  Callers in
 * this file invoke this with NSPACE_REQ_LOCK() held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10818
/*
 * Completion path driven by the resolver (via the vfs.nspace.complete
 * sysctl): look up the outstanding request by ID, optionally verify
 * that the vnode's recursive gencount has not changed since the request
 * was issued, and wake the waiting thread with the resolver's errno.
 * An unknown req_id is silently dropped (the waiter was likely
 * interrupted and already removed the request).
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/* hold off renames on the volume while we compare gencounts */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				/* could not fetch; 0 disables the comparison below */
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
10886
/*
 * The process currently decorated as the namespace resolver
 * (filecoordinationd); NULL when no resolver is registered.
 */
static struct proc *nspace_resolver_proc;

/*
 * Report whether p is the registered namespace resolver.
 * Always returns 0; *is_resolver is set to 1 or 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
10896
/*
 * Register (is_resolver != 0) or unregister p as the namespace
 * resolver.  The caller must be uid 0 and hold the
 * PRIV_VFS_DATALESS_RESOLVER privilege.  Returns EBUSY if another
 * process is already registered.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	error = priv_check_cred(vfs_context_ucred(ctx),
	    PRIV_VFS_DATALESS_RESOLVER, 0);
	if (error) {
		return error;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		/* only one resolver may be registered at a time */
		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
10941
10942 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)10943 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10944 {
10945 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10946 (p->p_vfs_iopolicy &
10947 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10948 *is_prevented = 1;
10949 } else {
10950 *is_prevented = 0;
10951 }
10952 return 0;
10953 }
10954
10955 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)10956 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10957 {
10958 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10959 return is_prevented ? 0 : EBUSY;
10960 }
10961
10962 if (is_prevented) {
10963 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10964 } else {
10965 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10966 }
10967 return 0;
10968 }
10969
10970 static int
nspace_materialization_get_thread_state(int * is_prevented)10971 nspace_materialization_get_thread_state(int *is_prevented)
10972 {
10973 uthread_t ut = current_uthread();
10974
10975 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10976 return 0;
10977 }
10978
10979 static int
nspace_materialization_set_thread_state(int is_prevented)10980 nspace_materialization_set_thread_state(int is_prevented)
10981 {
10982 uthread_t ut = current_uthread();
10983
10984 if (is_prevented) {
10985 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10986 } else {
10987 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10988 }
10989 return 0;
10990 }
10991
/* the vfs.nspace sysctl branch: namespace / dataless-file controls */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10994
10995 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)10996 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10997 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10998 {
10999 struct proc *p = req->p;
11000 int new_value, old_value, changed = 0;
11001 int error;
11002
11003 error = nspace_resolver_get_proc_state(p, &old_value);
11004 if (error) {
11005 return error;
11006 }
11007
11008 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11009 &changed);
11010 if (error == 0 && changed) {
11011 error = nspace_resolver_set_proc_state(p, new_value);
11012 }
11013 return error;
11014 }
11015
/* vfs.nspace.resolver: decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11020
11021 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11022 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11023 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11024 {
11025 struct proc *p = req->p;
11026 int new_value, old_value, changed = 0;
11027 int error;
11028
11029 error = nspace_materialization_get_proc_state(p, &old_value);
11030 if (error) {
11031 return error;
11032 }
11033
11034 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11035 &changed);
11036 if (error == 0 && changed) {
11037 error = nspace_materialization_set_proc_state(p, new_value);
11038 }
11039 return error;
11040 }
11041
/* vfs.nspace.prevent_materialization: decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11046
11047 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11048 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11049 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11050 {
11051 int new_value, old_value, changed = 0;
11052 int error;
11053
11054 error = nspace_materialization_get_thread_state(&old_value);
11055 if (error) {
11056 return error;
11057 }
11058
11059 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11060 &changed);
11061 if (error == 0 && changed) {
11062 error = nspace_materialization_set_thread_state(new_value);
11063 }
11064 return error;
11065 }
11066
/* vfs.nspace.thread_prevent_materialization: decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11071
/*
 * vfs.nspace.complete sysctl handler.
 *
 * The resolver process reports a completed materialization request
 * here.  The write payload is two uint32s -- { req_id, errno } --
 * optionally followed by a uint64 generation count.  Only the
 * decorated resolver process may call this (EPERM otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Only the decorated resolver may complete requests. */
	if (!is_resolver) {
		return EPERM;
	}

	/* First opaque read: the mandatory { req_id, errno } pair. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11116
/* vfs.nspace.complete: the resolver reports completed requests here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11121
11122 #endif /* CONFIG_DATALESS_FILES */
11123
11124 #if CONFIG_DATALESS_FILES
11125 #define __no_dataless_unused /* nothing */
11126 #else
11127 #define __no_dataless_unused __unused
11128 #endif
11129
/*
 * vfs_context_dataless_materialization_is_prevented:
 *
 * Decide whether the given vfs context may materialize dataless files.
 * Returns:
 *
 *	0		materialization may proceed
 *	EDEADLK		prevented (kernel context, a no-fault thread
 *			decoration, a process whose iopolicy does not
 *			opt in, or the default policy)
 *	EJUSTRETURN	the process holds the dataless-manipulation
 *			entitlement; depending on the operation, the
 *			caller proceeds as if the object is not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11186
/*
 * nspace_resolver_init: one-time allocation of the dataless-file
 * resolver request hash table (keyed by request id).
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11196
/*
 * nspace_resolver_exited: tear down resolver state for process p.
 * Called on process exit and when the resolver undecorates itself.
 * If p is the decorated resolver, every outstanding resolve request
 * is failed with ETIMEDOUT (waking its waiter) and the global
 * resolver reference is cleared.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail every pending request; the resolver is gone. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11222
/*
 * resolve_nspace_item: legacy entry point; resolves the namespace
 * item with no extension argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11228
11229 #define DATALESS_RESOLVER_ENTITLEMENT \
11230 "com.apple.private.vfs.dataless-resolver"
11231 #define DATALESS_MANIPULATION_ENTITLEMENT \
11232 "com.apple.private.vfs.dataless-manipulation"
11233
11234 /*
11235 * Return TRUE if the vfs context is associated with a process entitled
11236 * for dataless manipulation.
11237 *
11238 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11239 * complication around CONFIG_DATALESS_FILES.
11240 */
11241 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)11242 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
11243 {
11244 #if CONFIG_DATALESS_FILES
11245 assert(ctx->vc_thread == current_thread());
11246 return IOCurrentTaskHasEntitlement( DATALESS_MANIPULATION_ENTITLEMENT) ||
11247 IOCurrentTaskHasEntitlement(DATALESS_RESOLVER_ENTITLEMENT);
11248 #else
11249 return false;
11250 #endif /* CONFIG_DATALESS_FILES */
11251 }
11252
11253 #if CONFIG_DATALESS_FILES
11254 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11255 log_materialization_prevented(vnode_t vp, uint64_t op)
11256 {
11257 char p_name[MAXCOMLEN + 1];
11258 char *vntype;
11259 proc_selfname(&p_name[0], sizeof(p_name));
11260
11261 if (vp->v_type == VREG) {
11262 vntype = "File";
11263 } else if (vp->v_type == VDIR) {
11264 vntype = "Dir";
11265 } else if (vp->v_type == VLNK) {
11266 vntype = "SymLink";
11267 } else {
11268 vntype = "Other";
11269 }
11270
11271 #if DEVELOPMENT
11272 char *path = NULL;
11273 int len;
11274
11275 path = get_pathbuff();
11276 len = MAXPATHLEN;
11277 if (path) {
11278 vn_getpath(vp, path, &len);
11279 }
11280
11281 os_log_debug(OS_LOG_DEFAULT,
11282 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11283 p_name, proc_selfpid(),
11284 op, vntype, path ? path : "<unknown-path>");
11285 if (path) {
11286 release_pathbuff(path);
11287 }
11288 #else
11289 os_log_debug(OS_LOG_DEFAULT,
11290 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11291 p_name, proc_selfpid(),
11292 op, vntype);
11293 #endif
11294 }
11295 #endif /* CONFIG_DATALESS_FILES */
11296
11297
/*
 * vfs_materialize_item:
 *
 * Common implementation for vfs_materialize_file() and
 * vfs_materialize_dir().  Sends a resolve request for vp to
 * filecoordinationd over its Mach service port, then waits
 * (interruptibly) for the resolver to report completion via the
 * vfs.nspace.complete sysctl.  Returns 0 when the caller may proceed
 * with the operation, ETIMEDOUT when the resolver is unreachable,
 * or another errno.
 */
static int
vfs_materialize_item(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	int64_t offset __no_dataless_unused,
	int64_t size __no_dataless_unused,
	char *lookup_name __no_dataless_unused,
	size_t const namelen __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_request req;
	kern_return_t kern_ret;
	mach_port_t mach_port;
	char *path = NULL;
	vfs_context_t context;
	int path_len;
	int error;
#ifdef DATALESS_FILES_USE_AUDIT_TOKEN
	audit_token_t atoken;
#endif

	/*
	 * If this is a snapshot event and the vnode is on a disk image just
	 * pretend nothing happened since any change to the disk image will
	 * cause the disk image itself to get backed up and this avoids multi-
	 * way deadlocks between the snapshot handler and the ever popular
	 * diskimages-helper process. The variable nspace_allow_virtual_devs
	 * allows this behavior to be overridden (for use by the Mobile
	 * TimeMachine testing infrastructure which uses disk images).
	 */
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	context = vfs_context_current();

	/* Honor per-thread / per-process "no materialization" decorations. */
	error = vfs_context_dataless_materialization_is_prevented(context);
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
	    &mach_port);
	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		/*
		 * Treat this like being unable to access the backing store
		 * server.
		 */
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error) {
		goto out_release_port;
	}

#ifdef DATALESS_FILES_USE_AUDIT_TOKEN
	error = vfs_context_copy_audit_token(context, &atoken);
	if (error) {
		goto out_release_port;
	}
#endif

	req.r_req_id = next_nspace_req_id();
	req.r_resolver_error = 0;
	req.r_flags = 0;
	req.r_vp = vp;

	/* Publish the request so the resolver's completion can find it. */
	NSPACE_REQ_LOCK();
	error = nspace_resolver_req_add(&req);
	NSPACE_REQ_UNLOCK();
	if (error) {
		goto out_release_port;
	}

	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
	if (vp->v_type == VDIR) {
		char *tmpname = NULL;

		/*
		 * If the caller provided a lookup_name *and* a name length,
		 * then we assume the lookup_name is not NUL-terminated.
		 * Allocate a temporary buffer in this case to provide
		 * a NUL-terminated path name to the IPC call.
		 */
		if (lookup_name != NULL && namelen != 0) {
			if (namelen >= PATH_MAX) {
				error = EINVAL;
				goto out_release_port;
			}
			tmpname = zalloc(ZV_NAMEI);
			strlcpy(tmpname, lookup_name, namelen + 1);
			lookup_name = tmpname;
		} else if (lookup_name != NULL) {
			/*
			 * If the caller provided a lookup_name with a
			 * zero name length, then we assume it's NUL-
			 * terminated. Verify it has a valid length.
			 */
			if (strlen(lookup_name) >= PATH_MAX) {
				error = EINVAL;
				goto out_release_port;
			}
		}

#ifdef DATALESS_FILES_USE_AUDIT_TOKEN
		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
		    req.r_req_id, (uint32_t)(op & 0xffffffff),
		    lookup_name == NULL ? "" : lookup_name, path, atoken);
#else
		kern_ret = send_vfs_resolve_dir(mach_port, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    lookup_name == NULL ? "" : lookup_name, path);
#endif /* DATALESS_FILES_USE_AUDIT_TOKEN */

		if (tmpname != NULL) {
			zfree(ZV_NAMEI, tmpname);

			/*
			 * Poison lookup_name rather than reference
			 * freed memory.
			 */
			lookup_name = NULL;
		}
	} else {
#ifdef DATALESS_FILES_USE_AUDIT_TOKEN
		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
		    req.r_req_id, (uint32_t)(op & 0xffffffff),
		    offset, size, path, atoken);
#else
		kern_ret = send_vfs_resolve_file(mach_port, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    offset, size, path);
#endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
	}
	if (kern_ret != KERN_SUCCESS) {
		/*
		 * Also treat this like being unable to access the backing
		 * store server.
		 */
		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
		    kern_ret);
		error = ETIMEDOUT;

		NSPACE_REQ_LOCK();
		nspace_resolver_req_remove(&req);
		NSPACE_REQ_UNLOCK();
		goto out_release_port;
	}

	/*
	 * Give back the memory we allocated earlier while we wait; we
	 * no longer need it.
	 */
	zfree(ZV_NAMEI, path);
	path = NULL;

	/*
	 * Request has been submitted to the resolver. Now (interruptibly)
	 * wait for completion. Upon return, the request will have been
	 * removed from the lookup table.
	 */
	error = nspace_resolver_req_wait(&req);

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mach_port);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11479
11480 /*
11481 * vfs_materialize_file: Materialize a regular file.
11482 *
11483 * Inputs:
11484 * vp The dataless file to be materialized.
11485 *
11486 * op What kind of operation is being performed:
11487 * -> NAMESPACE_HANDLER_READ_OP
11488 * -> NAMESPACE_HANDLER_WRITE_OP
11489 * -> NAMESPACE_HANDLER_LINK_CREATE
11490 * -> NAMESPACE_HANDLER_DELETE_OP
11491 * -> NAMESPACE_HANDLER_TRUNCATE_OP
11492 * -> NAMESPACE_HANDLER_RENAME_OP
11493 *
11494 * offset offset of I/O for READ or WRITE. Ignored for
11495 * other ops.
11496 *
11497 * size size of I/O for READ or WRITE Ignored for
11498 * other ops.
11499 *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
11501 * consider the range to be unknown.
11502 *
11503 * Upon successful return, the caller may proceed with the operation.
11504 * N.B. the file may still be "dataless" in this case.
11505 */
11506 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)11507 vfs_materialize_file(
11508 struct vnode *vp,
11509 uint64_t op,
11510 int64_t offset,
11511 int64_t size)
11512 {
11513 if (vp->v_type != VREG) {
11514 return EFTYPE;
11515 }
11516 return vfs_materialize_item(vp, op, offset, size, NULL, 0);
11517 }
11518
11519 /*
11520 * vfs_materialize_dir:
11521 *
11522 * Inputs:
11523 * vp The dataless directory to be materialized.
11524 *
11525 * op What kind of operation is being performed:
11526 * -> NAMESPACE_HANDLER_READ_OP
11527 * -> NAMESPACE_HANDLER_WRITE_OP
11528 * -> NAMESPACE_HANDLER_DELETE_OP
11529 * -> NAMESPACE_HANDLER_RENAME_OP
11530 * -> NAMESPACE_HANDLER_LOOKUP_OP
11531 *
11532 * lookup_name Name being looked up for a LOOKUP op. Ignored for
11533 * other ops. May or may not be NUL-terminated; see below.
11534 *
11535 * namelen If non-zero, then lookup_name is assumed to not be NUL-
11536 * terminated and namelen is the number of valid bytes in
11537 * lookup_name. If zero, then lookup_name is assumed to be
11538 * NUL-terminated.
11539 *
11540 * Upon successful return, the caller may proceed with the operation.
11541 * N.B. the directory may still be "dataless" in this case.
11542 */
11543 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)11544 vfs_materialize_dir(
11545 struct vnode *vp,
11546 uint64_t op,
11547 char *lookup_name,
11548 size_t namelen)
11549 {
11550 if (vp->v_type != VDIR) {
11551 return EFTYPE;
11552 }
11553 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
11554 return EINVAL;
11555 }
11556 return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
11557 }
11558
/*
 * resolve_nspace_item_ext:
 *
 * Legacy namespace-event resolution path: send a resolve request for
 * vp to filecoordinationd and wait (interruptibly) for completion.
 * Only regular files, directories, and symlinks are eligible.  The
 * arg extension is currently unused.  Returns 0 when the caller may
 * proceed, ETIMEDOUT when the resolver is unreachable, or another
 * errno.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	// Honor per-thread / per-process "no materialization" decorations.
	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223; /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) { // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		// Publish the request so the resolver's completion can find it.
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11676
/*
 * nspace_snapshot_event: snapshot namespace events are not handled;
 * this stub exists for callers and always reports success.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
11683
11684 #if 0
11685 static int
11686 build_volfs_path(struct vnode *vp, char *path, int *len)
11687 {
11688 struct vnode_attr va;
11689 int ret;
11690
11691 VATTR_INIT(&va);
11692 VATTR_WANTED(&va, va_fsid);
11693 VATTR_WANTED(&va, va_fileid);
11694
11695 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
11696 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
11697 ret = -1;
11698 } else {
11699 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
11700 ret = 0;
11701 }
11702
11703 return ret;
11704 }
11705 #endif
11706
11707 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)11708 fsctl_bogus_command_compat(unsigned long cmd)
11709 {
11710 switch (cmd) {
11711 case IOCBASECMD(FSIOC_SYNC_VOLUME):
11712 return FSIOC_SYNC_VOLUME;
11713 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
11714 return FSIOC_ROUTEFS_SETROUTEID;
11715 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
11716 return FSIOC_SET_PACKAGE_EXTS;
11717 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
11718 return FSIOC_SET_FSTYPENAME_OVERRIDE;
11719 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
11720 return DISK_CONDITIONER_IOC_GET;
11721 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
11722 return DISK_CONDITIONER_IOC_SET;
11723 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
11724 return FSIOC_FIOSEEKHOLE;
11725 case IOCBASECMD(FSIOC_FIOSEEKDATA):
11726 return FSIOC_FIOSEEKDATA;
11727 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
11728 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
11729 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11730 return SPOTLIGHT_IOC_GET_LAST_MTIME;
11731 }
11732
11733 return cmd;
11734 }
11735
/*
 * chflags0() callback: perform the compare-and-swap of BSD flags by
 * handing the fsioc_cas_bsdflags argument to the filesystem via
 * VNOP_IOCTL.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11741
/*
 * FSIOC_SYNC_VOLUME handler: sync the volume containing vp.
 *
 * Drops the caller's iocount on vp (after taking a mount iter ref)
 * and sets *arg_vp to NULL so the caller knows the vnode was
 * released.  If the initial mount_iterref() fails, vp is untouched.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* wait flags computed above
	 * against FSCTL_SYNC_FULLSYNC rather than the caller-supplied
	 * *data word -- confirm this overlap of flag spaces is intended.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
11804
11805 #if ROUTEFS
11806 static int __attribute__((noinline))
handle_routes(user_addr_t udata)11807 handle_routes(user_addr_t udata)
11808 {
11809 char routepath[MAXPATHLEN];
11810 size_t len = 0;
11811 int error;
11812
11813 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11814 return error;
11815 }
11816 bzero(routepath, MAXPATHLEN);
11817 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11818 if (error) {
11819 return error;
11820 }
11821 error = routefs_kernel_mount(routepath);
11822 return error;
11823 }
11824 #endif
11825
11826 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)11827 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
11828 {
11829 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11830 struct vnode_attr va;
11831 int error;
11832
11833 VATTR_INIT(&va);
11834 VATTR_SET(&va, va_flags, cas->new_flags);
11835
11836 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11837 return error;
11838 }
11839
11840 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)11841 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
11842 {
11843 struct mount *mp = NULL;
11844 errno_t rootauth = 0;
11845
11846 mp = vp->v_mount;
11847
11848 /*
11849 * query the underlying FS and see if it reports something
11850 * sane for this vnode. If volume is authenticated via
11851 * chunklist, leave that for the caller to determine.
11852 */
11853 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
11854
11855 return rootauth;
11856 }
11857
11858 /*
11859 * Make a filesystem-specific control call:
11860 */
11861 /* ARGSUSED */
11862 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)11863 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
11864 {
11865 int error = 0;
11866 boolean_t is64bit;
11867 u_int size;
11868 #define STK_PARAMS 128
11869 char stkbuf[STK_PARAMS] = {0};
11870 caddr_t data, memp;
11871 vnode_t vp = *arg_vp;
11872
11873 if (vp->v_type == VCHR || vp->v_type == VBLK) {
11874 return ENOTTY;
11875 }
11876
11877 cmd = fsctl_bogus_command_compat(cmd);
11878
11879 size = IOCPARM_LEN(cmd);
11880 if (size > IOCPARM_MAX) {
11881 return EINVAL;
11882 }
11883
11884 is64bit = proc_is64bit(p);
11885
11886 memp = NULL;
11887
11888 if (size > sizeof(stkbuf)) {
11889 if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
11890 return ENOMEM;
11891 }
11892 data = memp;
11893 } else {
11894 data = &stkbuf[0];
11895 };
11896
11897 if (cmd & IOC_IN) {
11898 if (size) {
11899 error = copyin(udata, data, size);
11900 if (error) {
11901 if (memp) {
11902 kfree_data(memp, size);
11903 }
11904 return error;
11905 }
11906 } else {
11907 if (is64bit) {
11908 *(user_addr_t *)data = udata;
11909 } else {
11910 *(uint32_t *)data = (uint32_t)udata;
11911 }
11912 };
11913 } else if ((cmd & IOC_OUT) && size) {
11914 /*
11915 * Zero the buffer so the user always
11916 * gets back something deterministic.
11917 */
11918 bzero(data, size);
11919 } else if (cmd & IOC_VOID) {
11920 if (is64bit) {
11921 *(user_addr_t *)data = udata;
11922 } else {
11923 *(uint32_t *)data = (uint32_t)udata;
11924 }
11925 }
11926
11927 /* Check to see if it's a generic command */
11928 switch (cmd) {
11929 case FSIOC_SYNC_VOLUME:
11930 error = handle_sync_volume(vp, arg_vp, data, ctx);
11931 break;
11932
11933 case FSIOC_ROUTEFS_SETROUTEID:
11934 #if ROUTEFS
11935 error = handle_routes(udata);
11936 #endif
11937 break;
11938
11939 case FSIOC_SET_PACKAGE_EXTS: {
11940 user_addr_t ext_strings;
11941 uint32_t num_entries;
11942 uint32_t max_width;
11943
11944 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
11945 break;
11946 }
11947
11948 if ((is64bit && size != sizeof(user64_package_ext_info))
11949 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
11950 // either you're 64-bit and passed a 64-bit struct or
11951 // you're 32-bit and passed a 32-bit struct. otherwise
11952 // it's not ok.
11953 error = EINVAL;
11954 break;
11955 }
11956
11957 if (is64bit) {
11958 if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
11959 assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
11960 }
11961 ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
11962 num_entries = ((user64_package_ext_info *)data)->num_entries;
11963 max_width = ((user64_package_ext_info *)data)->max_width;
11964 } else {
11965 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
11966 num_entries = ((user32_package_ext_info *)data)->num_entries;
11967 max_width = ((user32_package_ext_info *)data)->max_width;
11968 }
11969 error = set_package_extensions_table(ext_strings, num_entries, max_width);
11970 }
11971 break;
11972
11973 case FSIOC_SET_FSTYPENAME_OVERRIDE:
11974 {
11975 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11976 break;
11977 }
11978 if (vp->v_mount) {
11979 mount_lock(vp->v_mount);
11980 if (data[0] != 0) {
11981 int i;
11982 for (i = 0; i < MFSTYPENAMELEN; i++) {
11983 if (!data[i]) {
11984 goto continue_copy;
11985 }
11986 }
11987 /*
11988 * Getting here means we have a user data string which has no
11989 * NULL termination in its first MFSTYPENAMELEN bytes.
11990 * This is bogus, let's avoid strlcpy-ing the read data and
11991 * return an error.
11992 */
11993 error = EINVAL;
11994 goto unlock;
11995 continue_copy:
11996 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
11997 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
11998 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11999 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
12000 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
12001 }
12002 } else {
12003 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
12004 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
12005 }
12006 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
12007 vp->v_mount->fstypename_override[0] = '\0';
12008 }
12009 unlock:
12010 mount_unlock(vp->v_mount);
12011 }
12012 }
12013 break;
12014
12015 case DISK_CONDITIONER_IOC_GET: {
12016 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12017 }
12018 break;
12019
12020 case DISK_CONDITIONER_IOC_SET: {
12021 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
12022 }
12023 break;
12024
12025 case FSIOC_CAS_BSDFLAGS:
12026 error = handle_flags(vp, data, ctx);
12027 break;
12028
12029 case FSIOC_FD_ONLY_OPEN_ONCE: {
12030 error = 0;
12031 if (vnode_usecount(vp) > 1) {
12032 vnode_lock_spin(vp);
12033 if (vp->v_lflag & VL_HASSTREAMS) {
12034 if (vnode_isinuse_locked(vp, 1, 1)) {
12035 error = EBUSY;
12036 }
12037 } else if (vnode_usecount(vp) > 1) {
12038 error = EBUSY;
12039 }
12040 vnode_unlock(vp);
12041 }
12042 }
12043 break;
12044
12045 case FSIOC_EVAL_ROOTAUTH:
12046 error = handle_auth(vp, cmd, data, options, ctx);
12047 break;
12048
12049 default: {
12050 /* other, known commands shouldn't be passed down here */
12051 switch (cmd) {
12052 case F_PUNCHHOLE:
12053 case F_TRIM_ACTIVE_FILE:
12054 case F_RDADVISE:
12055 case F_TRANSCODEKEY:
12056 case F_GETPROTECTIONLEVEL:
12057 case F_GETDEFAULTPROTLEVEL:
12058 case F_MAKECOMPRESSED:
12059 case F_SET_GREEDY_MODE:
12060 case F_SETSTATICCONTENT:
12061 case F_SETIOTYPE:
12062 case F_SETBACKINGSTORE:
12063 case F_GETPATH_MTMINFO:
12064 case APFSIOC_REVERT_TO_SNAPSHOT:
12065 case FSIOC_FIOSEEKHOLE:
12066 case FSIOC_FIOSEEKDATA:
12067 case HFS_GET_BOOT_INFO:
12068 case HFS_SET_BOOT_INFO:
12069 case FIOPINSWAP:
12070 case F_CHKCLEAN:
12071 case F_FULLFSYNC:
12072 case F_BARRIERFSYNC:
12073 case F_FREEZE_FS:
12074 case F_THAW_FS:
12075 case FSIOC_KERNEL_ROOTAUTH:
12076 error = EINVAL;
12077 goto outdrop;
12078 }
12079 /* Invoke the filesystem-specific code */
12080 error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12081 }
12082 } /* end switch stmt */
12083
12084 /*
12085 * if no errors, copy any data to user. Size was
12086 * already set and checked above.
12087 */
12088 if (error == 0 && (cmd & IOC_OUT) && size) {
12089 error = copyout(data, udata, size);
12090 }
12091
12092 outdrop:
12093 if (memp) {
12094 kfree_data(memp, size);
12095 }
12096
12097 return error;
12098 }
12099
12100 /* ARGSUSED */
12101 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12102 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12103 {
12104 int error;
12105 struct nameidata nd;
12106 uint32_t nameiflags;
12107 vnode_t vp = NULL;
12108 vfs_context_t ctx = vfs_context_current();
12109
12110 AUDIT_ARG(cmd, (int)uap->cmd);
12111 AUDIT_ARG(value32, uap->options);
12112 /* Get the vnode for the file we are getting info on: */
12113 nameiflags = 0;
12114 //
12115 // if we come through fsctl() then the file is by definition not open.
12116 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12117 // lest the caller mistakenly thinks the only open is their own (but in
12118 // reality it's someone elses).
12119 //
12120 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12121 return EINVAL;
12122 }
12123 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12124 nameiflags |= FOLLOW;
12125 }
12126 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12127 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12128 }
12129 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12130 UIO_USERSPACE, uap->path, ctx);
12131 if ((error = namei(&nd))) {
12132 goto done;
12133 }
12134 vp = nd.ni_vp;
12135 nameidone(&nd);
12136
12137 #if CONFIG_MACF
12138 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12139 if (error) {
12140 goto done;
12141 }
12142 #endif
12143
12144 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12145
12146 done:
12147 if (vp) {
12148 vnode_put(vp);
12149 }
12150 return error;
12151 }
12152 /* ARGSUSED */
12153 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12154 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12155 {
12156 int error;
12157 vnode_t vp = NULL;
12158 vfs_context_t ctx = vfs_context_current();
12159 int fd = -1;
12160
12161 AUDIT_ARG(fd, uap->fd);
12162 AUDIT_ARG(cmd, (int)uap->cmd);
12163 AUDIT_ARG(value32, uap->options);
12164
12165 /* Get the vnode for the file we are getting info on: */
12166 if ((error = file_vnode(uap->fd, &vp))) {
12167 return error;
12168 }
12169 fd = uap->fd;
12170 if ((error = vnode_getwithref(vp))) {
12171 file_drop(fd);
12172 return error;
12173 }
12174
12175 #if CONFIG_MACF
12176 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12177 file_drop(fd);
12178 vnode_put(vp);
12179 return error;
12180 }
12181 #endif
12182
12183 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12184
12185 file_drop(fd);
12186
12187 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12188 if (vp) {
12189 vnode_put(vp);
12190 }
12191
12192 return error;
12193 }
12194 /* end of fsctl system call */
12195
12196 #define FILESEC_ACCESS_ENTITLEMENT \
12197 "com.apple.private.vfs.filesec-access"
12198
12199 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12200 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12201 {
12202 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12203 /*
12204 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12205 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12206 */
12207 if ((!setting && vfs_context_issuser(ctx)) ||
12208 IOCurrentTaskHasEntitlement(FILESEC_ACCESS_ENTITLEMENT)) {
12209 return 0;
12210 }
12211 }
12212
12213 return EPERM;
12214 }
12215
12216 /*
12217 * Retrieve the data of an extended attribute.
12218 */
12219 int
getxattr(proc_t p,struct getxattr_args * uap,user_ssize_t * retval)12220 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
12221 {
12222 vnode_t vp;
12223 struct nameidata nd;
12224 char attrname[XATTR_MAXNAMELEN + 1];
12225 vfs_context_t ctx = vfs_context_current();
12226 uio_t auio = NULL;
12227 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12228 size_t attrsize = 0;
12229 size_t namelen;
12230 u_int32_t nameiflags;
12231 int error;
12232 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12233
12234 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12235 return EINVAL;
12236 }
12237
12238 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12239 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
12240 if ((error = namei(&nd))) {
12241 return error;
12242 }
12243 vp = nd.ni_vp;
12244 nameidone(&nd);
12245
12246 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12247 if (error != 0) {
12248 goto out;
12249 }
12250 if (xattr_protected(attrname) &&
12251 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12252 goto out;
12253 }
12254 /*
12255 * the specific check for 0xffffffff is a hack to preserve
12256 * binaray compatibilty in K64 with applications that discovered
12257 * that passing in a buf pointer and a size of -1 resulted in
12258 * just the size of the indicated extended attribute being returned.
12259 * this isn't part of the documented behavior, but because of the
12260 * original implemtation's check for "uap->size > 0", this behavior
12261 * was allowed. In K32 that check turned into a signed comparison
12262 * even though uap->size is unsigned... in K64, we blow by that
12263 * check because uap->size is unsigned and doesn't get sign smeared
12264 * in the munger for a 32 bit user app. we also need to add a
12265 * check to limit the maximum size of the buffer being passed in...
12266 * unfortunately, the underlying fileystems seem to just malloc
12267 * the requested size even if the actual extended attribute is tiny.
12268 * because that malloc is for kernel wired memory, we have to put a
12269 * sane limit on it.
12270 *
12271 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
12272 * U64 running on K64 will yield -1 (64 bits wide)
12273 * U32/U64 running on K32 will yield -1 (32 bits wide)
12274 */
12275 if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
12276 goto no_uio;
12277 }
12278
12279 if (uap->value) {
12280 if (uap->size > (size_t)XATTR_MAXSIZE) {
12281 uap->size = XATTR_MAXSIZE;
12282 }
12283
12284 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12285 &uio_buf[0], sizeof(uio_buf));
12286 uio_addiov(auio, uap->value, uap->size);
12287 }
12288 no_uio:
12289 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
12290 out:
12291 vnode_put(vp);
12292
12293 if (auio) {
12294 *retval = uap->size - uio_resid(auio);
12295 } else {
12296 *retval = (user_ssize_t)attrsize;
12297 }
12298
12299 return error;
12300 }
12301
12302 /*
12303 * Retrieve the data of an extended attribute.
12304 */
12305 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12306 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12307 {
12308 vnode_t vp;
12309 char attrname[XATTR_MAXNAMELEN + 1];
12310 vfs_context_t ctx = vfs_context_current();
12311 uio_t auio = NULL;
12312 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12313 size_t attrsize = 0;
12314 size_t namelen;
12315 int error;
12316 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12317
12318 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12319 return EINVAL;
12320 }
12321
12322 if ((error = file_vnode(uap->fd, &vp))) {
12323 return error;
12324 }
12325 if ((error = vnode_getwithref(vp))) {
12326 file_drop(uap->fd);
12327 return error;
12328 }
12329 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12330 if (error != 0) {
12331 goto out;
12332 }
12333 if (xattr_protected(attrname) &&
12334 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12335 goto out;
12336 }
12337 if (uap->value && uap->size > 0) {
12338 if (uap->size > (size_t)XATTR_MAXSIZE) {
12339 uap->size = XATTR_MAXSIZE;
12340 }
12341
12342 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12343 &uio_buf[0], sizeof(uio_buf));
12344 uio_addiov(auio, uap->value, uap->size);
12345 }
12346
12347 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
12348 out:
12349 (void)vnode_put(vp);
12350 file_drop(uap->fd);
12351
12352 if (auio) {
12353 *retval = uap->size - uio_resid(auio);
12354 } else {
12355 *retval = (user_ssize_t)attrsize;
12356 }
12357 return error;
12358 }
12359
/*
 * Scratch state for setxattr(): the nameidata, attribute-name buffer and
 * uio backing store are bundled so setxattr() can obtain them with a
 * single kalloc_data() instead of placing them on the stack.
 * (The old "checkdirs iteration" comment here was a copy-paste error.)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
12366
12367 /*
12368 * Set the data of an extended attribute.
12369 */
12370 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)12371 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
12372 {
12373 vnode_t vp;
12374 vfs_context_t ctx = vfs_context_current();
12375 uio_t auio = NULL;
12376 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12377 size_t namelen;
12378 u_int32_t nameiflags;
12379 int error;
12380 struct setxattr_ctx *sactx;
12381
12382 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12383 return EINVAL;
12384 }
12385
12386 sactx = (struct setxattr_ctx *)kalloc_data(sizeof(struct setxattr_ctx), Z_WAITOK);
12387 if (sactx == NULL) {
12388 return ENOMEM;
12389 }
12390
12391 error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
12392 if (error != 0) {
12393 if (error == EPERM) {
12394 /* if the string won't fit in attrname, copyinstr emits EPERM */
12395 error = ENAMETOOLONG;
12396 }
12397 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12398 goto out;
12399 }
12400 if (xattr_protected(sactx->attrname) &&
12401 (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
12402 goto out;
12403 }
12404 if (uap->size != 0 && uap->value == 0) {
12405 error = EINVAL;
12406 goto out;
12407 }
12408 if (uap->size > INT_MAX) {
12409 error = E2BIG;
12410 goto out;
12411 }
12412
12413 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12414 NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
12415 if ((error = namei(&sactx->nd))) {
12416 goto out;
12417 }
12418 vp = sactx->nd.ni_vp;
12419 nameidone(&sactx->nd);
12420
12421 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12422 &sactx->uio_buf[0], sizeof(sactx->uio_buf));
12423 uio_addiov(auio, uap->value, uap->size);
12424
12425 error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
12426 #if CONFIG_FSE
12427 if (error == 0) {
12428 add_fsevent(FSE_XATTR_MODIFIED, ctx,
12429 FSE_ARG_VNODE, vp,
12430 FSE_ARG_DONE);
12431 }
12432 #endif
12433 vnode_put(vp);
12434 out:
12435 kfree_data(sactx, sizeof(struct setxattr_ctx));
12436 *retval = 0;
12437 return error;
12438 }
12439
12440 /*
12441 * Set the data of an extended attribute.
12442 */
12443 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)12444 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
12445 {
12446 vnode_t vp;
12447 char attrname[XATTR_MAXNAMELEN + 1];
12448 vfs_context_t ctx = vfs_context_current();
12449 uio_t auio = NULL;
12450 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12451 size_t namelen;
12452 int error;
12453 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12454
12455 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12456 return EINVAL;
12457 }
12458
12459 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12460 if (error != 0) {
12461 if (error == EPERM) {
12462 /* if the string won't fit in attrname, copyinstr emits EPERM */
12463 return ENAMETOOLONG;
12464 }
12465 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12466 return error;
12467 }
12468 if (xattr_protected(attrname) &&
12469 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
12470 return error;
12471 }
12472 if (uap->size != 0 && uap->value == 0) {
12473 return EINVAL;
12474 }
12475 if (uap->size > INT_MAX) {
12476 return E2BIG;
12477 }
12478 if ((error = file_vnode(uap->fd, &vp))) {
12479 return error;
12480 }
12481 if ((error = vnode_getwithref(vp))) {
12482 file_drop(uap->fd);
12483 return error;
12484 }
12485 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12486 &uio_buf[0], sizeof(uio_buf));
12487 uio_addiov(auio, uap->value, uap->size);
12488
12489 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
12490 #if CONFIG_FSE
12491 if (error == 0) {
12492 add_fsevent(FSE_XATTR_MODIFIED, ctx,
12493 FSE_ARG_VNODE, vp,
12494 FSE_ARG_DONE);
12495 }
12496 #endif
12497 vnode_put(vp);
12498 file_drop(uap->fd);
12499 *retval = 0;
12500 return error;
12501 }
12502
12503 /*
12504 * Remove an extended attribute.
12505 * XXX Code duplication here.
12506 */
12507 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)12508 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
12509 {
12510 vnode_t vp;
12511 struct nameidata nd;
12512 char attrname[XATTR_MAXNAMELEN + 1];
12513 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12514 vfs_context_t ctx = vfs_context_current();
12515 size_t namelen;
12516 u_int32_t nameiflags;
12517 int error;
12518
12519 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12520 return EINVAL;
12521 }
12522
12523 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12524 if (error != 0) {
12525 return error;
12526 }
12527 if (xattr_protected(attrname)) {
12528 return EPERM;
12529 }
12530 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12531 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
12532 if ((error = namei(&nd))) {
12533 return error;
12534 }
12535 vp = nd.ni_vp;
12536 nameidone(&nd);
12537
12538 error = vn_removexattr(vp, attrname, uap->options, ctx);
12539 #if CONFIG_FSE
12540 if (error == 0) {
12541 add_fsevent(FSE_XATTR_REMOVED, ctx,
12542 FSE_ARG_VNODE, vp,
12543 FSE_ARG_DONE);
12544 }
12545 #endif
12546 vnode_put(vp);
12547 *retval = 0;
12548 return error;
12549 }
12550
12551 /*
12552 * Remove an extended attribute.
12553 * XXX Code duplication here.
12554 */
12555 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)12556 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
12557 {
12558 vnode_t vp;
12559 char attrname[XATTR_MAXNAMELEN + 1];
12560 size_t namelen;
12561 int error;
12562 #if CONFIG_FSE
12563 vfs_context_t ctx = vfs_context_current();
12564 #endif
12565
12566 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12567 return EINVAL;
12568 }
12569
12570 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12571 if (error != 0) {
12572 return error;
12573 }
12574 if (xattr_protected(attrname)) {
12575 return EPERM;
12576 }
12577 if ((error = file_vnode(uap->fd, &vp))) {
12578 return error;
12579 }
12580 if ((error = vnode_getwithref(vp))) {
12581 file_drop(uap->fd);
12582 return error;
12583 }
12584
12585 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
12586 #if CONFIG_FSE
12587 if (error == 0) {
12588 add_fsevent(FSE_XATTR_REMOVED, ctx,
12589 FSE_ARG_VNODE, vp,
12590 FSE_ARG_DONE);
12591 }
12592 #endif
12593 vnode_put(vp);
12594 file_drop(uap->fd);
12595 *retval = 0;
12596 return error;
12597 }
12598
12599 /*
12600 * Retrieve the list of extended attribute names.
12601 * XXX Code duplication here.
12602 */
12603 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)12604 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
12605 {
12606 vnode_t vp;
12607 struct nameidata nd;
12608 vfs_context_t ctx = vfs_context_current();
12609 uio_t auio = NULL;
12610 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12611 size_t attrsize = 0;
12612 u_int32_t nameiflags;
12613 int error;
12614 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12615
12616 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12617 return EINVAL;
12618 }
12619
12620 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12621 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
12622 if ((error = namei(&nd))) {
12623 return error;
12624 }
12625 vp = nd.ni_vp;
12626 nameidone(&nd);
12627 if (uap->namebuf != 0 && uap->bufsize > 0) {
12628 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
12629 &uio_buf[0], sizeof(uio_buf));
12630 uio_addiov(auio, uap->namebuf, uap->bufsize);
12631 }
12632
12633 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
12634
12635 vnode_put(vp);
12636 if (auio) {
12637 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12638 } else {
12639 *retval = (user_ssize_t)attrsize;
12640 }
12641 return error;
12642 }
12643
12644 /*
12645 * Retrieve the list of extended attribute names.
12646 * XXX Code duplication here.
12647 */
12648 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)12649 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
12650 {
12651 vnode_t vp;
12652 uio_t auio = NULL;
12653 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12654 size_t attrsize = 0;
12655 int error;
12656 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12657
12658 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12659 return EINVAL;
12660 }
12661
12662 if ((error = file_vnode(uap->fd, &vp))) {
12663 return error;
12664 }
12665 if ((error = vnode_getwithref(vp))) {
12666 file_drop(uap->fd);
12667 return error;
12668 }
12669 if (uap->namebuf != 0 && uap->bufsize > 0) {
12670 auio = uio_createwithbuffer(1, 0, spacetype,
12671 UIO_READ, &uio_buf[0], sizeof(uio_buf));
12672 uio_addiov(auio, uap->namebuf, uap->bufsize);
12673 }
12674
12675 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
12676
12677 vnode_put(vp);
12678 file_drop(uap->fd);
12679 if (auio) {
12680 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12681 } else {
12682 *retval = (user_ssize_t)attrsize;
12683 }
12684 return error;
12685 }
12686
12687 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)12688 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
12689 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
12690 {
12691 int error;
12692 struct mount *mp = NULL;
12693 vnode_t vp;
12694 int length;
12695 int bpflags;
12696 /* maximum number of times to retry build_path */
12697 unsigned int retries = 0x10;
12698
12699 if (bufsize > PAGE_SIZE) {
12700 return EINVAL;
12701 }
12702
12703 if (buf == NULL) {
12704 return ENOMEM;
12705 }
12706
12707 retry:
12708 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
12709 error = ENOTSUP; /* unexpected failure */
12710 return ENOTSUP;
12711 }
12712
12713 #if CONFIG_UNION_MOUNTS
12714 unionget:
12715 #endif /* CONFIG_UNION_MOUNTS */
12716 if (objid == 2) {
12717 struct vfs_attr vfsattr;
12718 int use_vfs_root = TRUE;
12719
12720 VFSATTR_INIT(&vfsattr);
12721 VFSATTR_WANTED(&vfsattr, f_capabilities);
12722 if (!(options & FSOPT_ISREALFSID) &&
12723 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
12724 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
12725 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
12726 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
12727 use_vfs_root = FALSE;
12728 }
12729 }
12730
12731 if (use_vfs_root) {
12732 error = VFS_ROOT(mp, &vp, ctx);
12733 } else {
12734 error = VFS_VGET(mp, objid, &vp, ctx);
12735 }
12736 } else {
12737 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
12738 }
12739
12740 #if CONFIG_UNION_MOUNTS
12741 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
12742 /*
12743 * If the fileid isn't found and we're in a union
12744 * mount volume, then see if the fileid is in the
12745 * mounted-on volume.
12746 */
12747 struct mount *tmp = mp;
12748 mp = vnode_mount(tmp->mnt_vnodecovered);
12749 vfs_unbusy(tmp);
12750 if (vfs_busy(mp, LK_NOWAIT) == 0) {
12751 goto unionget;
12752 }
12753 } else {
12754 vfs_unbusy(mp);
12755 }
12756 #else
12757 vfs_unbusy(mp);
12758 #endif /* CONFIG_UNION_MOUNTS */
12759
12760 if (error) {
12761 return error;
12762 }
12763
12764 #if CONFIG_MACF
12765 error = mac_vnode_check_fsgetpath(ctx, vp);
12766 if (error) {
12767 vnode_put(vp);
12768 return error;
12769 }
12770 #endif
12771
12772 /* Obtain the absolute path to this vnode. */
12773 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
12774 if (options & FSOPT_NOFIRMLINKPATH) {
12775 bpflags |= BUILDPATH_NO_FIRMLINK;
12776 }
12777 bpflags |= BUILDPATH_CHECK_MOVED;
12778 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
12779 vnode_put(vp);
12780
12781 if (error) {
12782 /* there was a race building the path, try a few more times */
12783 if (error == EAGAIN) {
12784 --retries;
12785 if (retries > 0) {
12786 goto retry;
12787 }
12788
12789 error = ENOENT;
12790 }
12791 goto out;
12792 }
12793
12794 AUDIT_ARG(text, buf);
12795
12796 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
12797 unsigned long path_words[NUMPARMS];
12798 size_t path_len = sizeof(path_words);
12799
12800 if ((size_t)length < path_len) {
12801 memcpy((char *)path_words, buf, length);
12802 memset((char *)path_words + length, 0, path_len - length);
12803
12804 path_len = length;
12805 } else {
12806 memcpy((char *)path_words, buf + (length - path_len), path_len);
12807 }
12808
12809 kdebug_vfs_lookup(path_words, (int)path_len, vp,
12810 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
12811 }
12812
12813 *pathlen = length; /* may be superseded by error */
12814
12815 out:
12816 return error;
12817 }
12818
12819 /*
12820 * Obtain the full pathname of a file system object by id.
12821 */
12822 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)12823 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
12824 uint32_t options, user_ssize_t *retval)
12825 {
12826 vfs_context_t ctx = vfs_context_current();
12827 fsid_t fsid;
12828 char *realpath;
12829 int length;
12830 int error;
12831
12832 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
12833 return EINVAL;
12834 }
12835
12836 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
12837 return error;
12838 }
12839 AUDIT_ARG(value32, fsid.val[0]);
12840 AUDIT_ARG(value64, objid);
12841 /* Restrict output buffer size for now. */
12842
12843 if (bufsize > PAGE_SIZE || bufsize <= 0) {
12844 return EINVAL;
12845 }
12846 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
12847 if (realpath == NULL) {
12848 return ENOMEM;
12849 }
12850
12851 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
12852 options, &length);
12853
12854 if (error) {
12855 goto out;
12856 }
12857
12858 error = copyout((caddr_t)realpath, buf, length);
12859
12860 *retval = (user_ssize_t)length; /* may be superseded by error */
12861 out:
12862 kfree_data(realpath, bufsize);
12863 return error;
12864 }
12865
/* fsgetpath(2): legacy entry point, identical to fsgetpath_ext() with no options. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
12872
/* fsgetpath_ext(2): like fsgetpath(2) but passes the caller's options through. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
12879
12880 /*
12881 * Common routine to handle various flavors of statfs data heading out
12882 * to user space.
12883 *
12884 * Returns: 0 Success
12885 * EFAULT
12886 */
12887 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)12888 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
12889 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
12890 boolean_t partial_copy)
12891 {
12892 int error;
12893 int my_size, copy_size;
12894
12895 if (is_64_bit) {
12896 struct user64_statfs sfs;
12897 my_size = copy_size = sizeof(sfs);
12898 bzero(&sfs, my_size);
12899 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12900 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
12901 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12902 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
12903 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
12904 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
12905 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
12906 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
12907 sfs.f_files = (user64_long_t)sfsp->f_files;
12908 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
12909 sfs.f_fsid = sfsp->f_fsid;
12910 sfs.f_owner = sfsp->f_owner;
12911 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12912 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12913 } else {
12914 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12915 }
12916 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12917 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12918
12919 if (partial_copy) {
12920 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12921 }
12922 error = copyout((caddr_t)&sfs, bufp, copy_size);
12923 } else {
12924 struct user32_statfs sfs;
12925
12926 my_size = copy_size = sizeof(sfs);
12927 bzero(&sfs, my_size);
12928
12929 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12930 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
12931 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12932
12933 /*
12934 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
12935 * have to fudge the numbers here in that case. We inflate the blocksize in order
12936 * to reflect the filesystem size as best we can.
12937 */
12938 if ((sfsp->f_blocks > INT_MAX)
12939 /* Hack for 4061702 . I think the real fix is for Carbon to
12940 * look for some volume capability and not depend on hidden
12941 * semantics agreed between a FS and carbon.
12942 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
12943 * for Carbon to set bNoVolumeSizes volume attribute.
12944 * Without this the webdavfs files cannot be copied onto
12945 * disk as they look huge. This change should not affect
12946 * XSAN as they should not setting these to -1..
12947 */
12948 && (sfsp->f_blocks != 0xffffffffffffffffULL)
12949 && (sfsp->f_bfree != 0xffffffffffffffffULL)
12950 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
12951 int shift;
12952
12953 /*
12954 * Work out how far we have to shift the block count down to make it fit.
12955 * Note that it's possible to have to shift so far that the resulting
12956 * blocksize would be unreportably large. At that point, we will clip
12957 * any values that don't fit.
12958 *
12959 * For safety's sake, we also ensure that f_iosize is never reported as
12960 * being smaller than f_bsize.
12961 */
12962 for (shift = 0; shift < 32; shift++) {
12963 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
12964 break;
12965 }
12966 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
12967 break;
12968 }
12969 }
12970 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
12971 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
12972 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
12973 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
12974 #undef __SHIFT_OR_CLIP
12975 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
12976 sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
12977 } else {
12978 /* filesystem is small enough to be reported honestly */
12979 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
12980 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
12981 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
12982 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
12983 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
12984 }
12985 sfs.f_files = (user32_long_t)sfsp->f_files;
12986 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
12987 sfs.f_fsid = sfsp->f_fsid;
12988 sfs.f_owner = sfsp->f_owner;
12989 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12990 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12991 } else {
12992 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12993 }
12994 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12995 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12996
12997 if (partial_copy) {
12998 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12999 }
13000 error = copyout((caddr_t)&sfs, bufp, copy_size);
13001 }
13002
13003 if (sizep != NULL) {
13004 *sizep = my_size;
13005 }
13006 return error;
13007 }
13008
13009 /*
13010 * copy stat structure into user_stat structure.
13011 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/*
	 * Zero the whole destination first so struct padding and any
	 * fields not explicitly copied below are never leaked to user
	 * space through the copyout of this structure.
	 */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Normal kernel build: timestamps are struct timespec members. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	/* POSIX-namespace build: timestamps are discrete sec/nsec fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13048
/*
 * copy stat structure into user32_stat structure (for 32-bit callers).
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so padding/uncopied fields never leak to user space. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/*
	 * Explicit narrowing casts to the 32-bit user ABI types; seconds
	 * values beyond the 32-bit range are truncated here.
	 */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13085
13086 /*
13087 * copy stat64 structure into user_stat64 structure.
13088 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero first so padding/uncopied fields never leak to user space. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* stat64 adds the birth (creation) time to the timestamp set. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13129
/*
 * copy stat64 structure into user32_stat64 structure (for 32-bit callers).
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so padding/uncopied fields never leak to user space. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/*
	 * Narrowing casts to the 32-bit user ABI types; includes the
	 * birth (creation) time that stat64 carries.
	 */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13170
13171 /*
13172 * Purge buffer cache for simulating cold starts
13173 */
/*
 * vnode_iterate() callback: push all dirty UBC pages for this vnode to
 * the backing store and invalidate the cached pages, so subsequent
 * access behaves like a cold start.  Always continues the iteration.
 */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
13181
/*
 * vfs_iterate() callback: purge the buffer cache of every vnode on the
 * mount (VNODE_ITERATE_ALL), waiting for busy vnodes (VNODE_WAIT).
 */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
13189
/*
 * vfs_purge(2) system call: purge the buffer cache on every mounted
 * filesystem to simulate a cold start.  Restricted to the superuser;
 * returns EPERM otherwise, 0 on success.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	return 0;
}
13201
13202 /*
13203 * gets the vnode associated with the (unnamed) snapshot directory
13204 * for a Filesystem. The snapshot directory vnode is returned with
13205 * an iocount on it.
13206 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/*
	 * Delegate to the filesystem's snapshot-directory VGET; on success
	 * *sdvpp is returned with an iocount held (caller must vnode_put).
	 */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
13212
13213 /*
13214 * Get the snapshot vnode.
13215 *
13216 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13217 * needs nameidone() on ndp.
13218 *
13219 * If the snapshot vnode exists it is returned in ndp->ni_vp.
13220 *
13221 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13222 * not needed.
13223 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-set outputs so the error path can unwind uniformly. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Take an iocount on the vnode behind dirfd. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot ops must be directed at the root of a volume. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if the loop stops early, one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy hooks for create/delete; other ops are unchecked here. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	/*
	 * On success the caller owns the nameidone() and, if the snapshot
	 * exists, the iocount on ndp->ni_vp.
	 */
	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, drop both iocounts and NULL the outputs. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
13326
13327 /*
13328 * create a filesystem snapshot (for supporting filesystems)
13329 *
13330 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13331 * We get to the (unnamed) snapshot directory vnode and create the vnode
13332 * for the snapshot in it.
13333 *
13334 * Restrictions:
13335 *
13336 * a) Passed in name for snapshot cannot have slashes.
13337 * b) name can't be "." or ".."
13338 *
13339 * Since this requires superuser privileges, vnode_authorize calls are not
13340 * made.
13341 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp, nameidone() owed. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Lookup found an existing snapshot of that name. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/*
		 * Caller is superuser-checked, so skip authorization and
		 * permission inheritance.
		 */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13388
13389 /*
13390 * Delete a Filesystem snapshot
13391 *
13392 * get the vnode for the unnamed snapshot directory and the snapshot and
13393 * delete the snapshot.
13394 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * DELETE lookup: on success ni_vp holds the snapshot vnode with an
	 * iocount, plus iocounts on rvp and snapdvp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove without generating a namespace (fsevents) notification. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13423
13424 /*
13425 * Revert a filesystem to a snapshot
13426 *
13427 * Marks the filesystem to revert to the given snapshot on next mount.
13428 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a synthetic componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-acquire rvp (released above) plus the snapshot vnode. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
13512
13513 /*
13514 * rename a Filesystem snapshot
13515 *
13516 * get the vnode for the unnamed snapshot directory and the snapshot and
13517 * rename the snapshot. This is a very specialised (and simple) case of
13518 * rename(2) (which has to deal with a lot more complications). It differs
13519 * slightly from rename(2) in that EEXIST is returned if the new name exists.
13520 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * DELETE lookup on the old name: validates it, and on success
	 * holds iocounts on rvp, snapdvp and fromnd->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if the loop stops early, one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Rename creates the new name, so run the create check on it. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both names live in the snapshot directory (same dvp on each side). */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
13615
13616 /*
13617 * Mount a Filesystem snapshot
13618 *
13619 * get the vnode for the unnamed snapshot directory and the snapshot and
13620 * mount the snapshot.
13621 */
13622 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)13623 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
13624 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
13625 {
13626 mount_t mp;
13627 vnode_t rvp, snapdvp, snapvp, vp, pvp;
13628 struct fs_snapshot_mount_args smnt_data;
13629 int error;
13630 struct nameidata *snapndp, *dirndp;
13631 /* carving out a chunk for structs that are too big to be on stack. */
13632 struct {
13633 struct nameidata snapnd;
13634 struct nameidata dirnd;
13635 } * __snapshot_mount_data;
13636
13637 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
13638 snapndp = &__snapshot_mount_data->snapnd;
13639 dirndp = &__snapshot_mount_data->dirnd;
13640
13641 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
13642 OP_LOOKUP, ctx);
13643 if (error) {
13644 goto out;
13645 }
13646
13647 snapvp = snapndp->ni_vp;
13648 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
13649 error = EIO;
13650 goto out1;
13651 }
13652
13653 /* Get the vnode to be covered */
13654 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
13655 UIO_USERSPACE, directory, ctx);
13656 error = namei(dirndp);
13657 if (error) {
13658 goto out1;
13659 }
13660
13661 vp = dirndp->ni_vp;
13662 pvp = dirndp->ni_dvp;
13663 mp = vnode_mount(rvp);
13664
13665 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
13666 error = EINVAL;
13667 goto out2;
13668 }
13669
13670 #if CONFIG_MACF
13671 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
13672 mp->mnt_vfsstat.f_fstypename);
13673 if (error) {
13674 goto out2;
13675 }
13676 #endif
13677
13678 smnt_data.sm_mp = mp;
13679 smnt_data.sm_cnp = &snapndp->ni_cnd;
13680 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
13681 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
13682 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
13683
13684 out2:
13685 vnode_put(vp);
13686 vnode_put(pvp);
13687 nameidone(dirndp);
13688 out1:
13689 vnode_put(snapvp);
13690 vnode_put(snapdvp);
13691 vnode_put(rvp);
13692 nameidone(snapndp);
13693 out:
13694 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
13695 return error;
13696 }
13697
13698 /*
13699 * Root from a snapshot of the filesystem
13700 *
13701 * Marks the filesystem to root from the given snapshot on next boot.
13702 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a synthetic componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	/* The FS records the snapshot to root from on the next boot. */
	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
13758
13759 /*
13760 * FS snapshot operations dispatcher
13761 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot operations require the VFS snapshot privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed if any of: superuser, write access to the backing
		 * device, or the private snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOCurrentTaskHasEntitlement("com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-op implementation. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name2 is the mount point path; data is mount-specific. */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
13851