1 /*
2 * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <os/atomic_private.h>
127 #include <pexpert/pexpert.h>
128 #include <IOKit/IOBSD.h>
129
130 // deps for MIG call
131 #include <kern/host.h>
132 #include <kern/ipc_misc.h>
133 #include <mach/host_priv.h>
134 #include <mach/vfs_nspace.h>
135 #include <os/log.h>
136
137 #include <nfs/nfs_conf.h>
138
139 #if ROUTEFS
140 #include <miscfs/routefs/routefs.h>
141 #endif /* ROUTEFS */
142
143 #if CONFIG_MACF
144 #include <security/mac.h>
145 #include <security/mac_framework.h>
146 #endif
147
148 #if CONFIG_FSE
149 #define GET_PATH(x) \
150 ((x) = get_pathbuff())
151 #define RELEASE_PATH(x) \
152 release_pathbuff(x)
153 #else
154 #define GET_PATH(x) \
155 ((x) = zalloc(ZV_NAMEI))
156 #define RELEASE_PATH(x) \
157 zfree(ZV_NAMEI, x)
158 #endif /* CONFIG_FSE */
159
160 #ifndef HFS_GET_BOOT_INFO
161 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
162 #endif
163
164 #ifndef HFS_SET_BOOT_INFO
165 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
166 #endif
167
168 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
169 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
170 #endif
171
172 extern void disk_conditioner_unmount(mount_t mp);
173
174 /* struct for checkdirs iteration */
175 struct cdirargs {
176 vnode_t olddp;
177 vnode_t newdp;
178 };
179 /* callback for checkdirs iteration */
180 static int checkdirs_callback(proc_t p, void * arg);
181
182 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
183 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
184 void enablequotas(struct mount *mp, vfs_context_t ctx);
185 static int getfsstat_callback(mount_t mp, void * arg);
186 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
187 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
188 static int sync_callback(mount_t, void *);
189 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
190 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
191 boolean_t partial_copy);
192 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
193 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
194 struct componentname *cnp, user_addr_t fsmountargs,
195 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
196 void vfs_notify_mount(vnode_t pdvp);
197
198 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
199
200 struct fd_vn_data * fg_vn_data_alloc(void);
201
202 /*
203 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
204 * Concurrent lookups (or lookups by ids) on hard links can cause the
205 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
206 * does) to return ENOENT as the path cannot be returned from the name cache
207 * alone. We have no option but to retry and hope to get one namei->reverse path
208 * generation done without an intervening lookup, lookup by id on the hard link
209 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
210 * which currently are the MAC hooks for rename, unlink and rmdir.
211 */
212 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
213
214 /* Max retry limit for rename due to vnode recycling. */
215 #define MAX_RENAME_ERECYCLE_RETRIES 1024
216
217 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
218 int unlink_flags);
219
220 #ifdef CONFIG_IMGSRC_ACCESS
221 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
222 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
223 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
224 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
225 static void mount_end_update(mount_t mp);
226 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
227 #endif /* CONFIG_IMGSRC_ACCESS */
228
229 //snapshot functions
230 #if CONFIG_MNT_ROOTSNAP
231 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
232 #else
233 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
234 #endif
235
236 __private_extern__
237 int sync_internal(void);
238
239 __private_extern__
240 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
241
242 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
243 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
244
245 /* vars for sync mutex */
246 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
247 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
248
249 extern lck_rw_t rootvnode_rw_lock;
250
251 /*
252 * incremented each time a mount or unmount operation occurs
253 * used to invalidate the cached value of the rootvp in the
254 * mount structure utilized by cache_lookup_path
255 */
256 uint32_t mount_generation = 0;
257
258 /* counts number of mount and unmount operations */
259 unsigned int vfs_nummntops = 0;
260
261 /* system-wide, per-boot unique mount ID */
262 static _Atomic uint64_t mount_unique_id = 1;
263
264 extern const struct fileops vnops;
265 #if CONFIG_APPLEDOUBLE
266 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
267 #endif /* CONFIG_APPLEDOUBLE */
268
269 /*
270 * Virtual File System System Calls
271 */
272
273 /*
274 * Private in-kernel mounting spi (specific use-cases only)
275 */
276 boolean_t
vfs_iskernelmount(mount_t mp)277 vfs_iskernelmount(mount_t mp)
278 {
279 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
280 }
281
/*
 * kernel_mount:
 *	Perform a mount on behalf of an in-kernel caller.
 *
 * Parameters:
 *	fstype		filesystem type name (vfs name)
 *	pvp		parent of the covered vnode, or NULLVP to look it up
 *	vp		vnode to be covered, or NULLVP to look it up from 'path'
 *	path		kernel-space path to the mount point
 *	data		filesystem-specific mount arguments (kernel address,
 *			passed through to mount_common as a user_addr_t)
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* flags (sanitized below)
 *	ctx		caller's vfs context
 *
 * Returns:	0 on success, errno on failure.
 *
 * When vp is supplied by the caller, both vp and pvp iocounts remain the
 * caller's responsibility; when looked up here, they are released before
 * returning.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any flag bits in-kernel callers are not permitted to pass. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures only for snapshot/by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; fabricate just enough of the
		 * componentname (path buffer and length) for mount_common.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as a kernel-initiated mount for mount_common. */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only drop iocounts / namei state we acquired ourselves. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
331
332 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)333 vfs_mount_at_path(const char *fstype, const char *path,
334 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
335 int mnt_flags, int flags)
336 {
337 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
338 int error, km_flags = 0;
339
340 /*
341 * This call is currently restricted to specific use cases.
342 */
343 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
344 return ENOTSUP;
345 }
346
347 #if !defined(XNU_TARGET_OS_OSX)
348 if (strcmp(fstype, "lifs") == 0) {
349 syscall_flags |= MNT_NOEXEC;
350 }
351 #endif
352
353 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
354 km_flags |= KERNEL_MOUNT_NOAUTH;
355 }
356 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
357 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
358 }
359
360 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
361 syscall_flags, km_flags, vfs_context_kernel());
362 if (error) {
363 printf("%s: mount on %s failed, error %d\n", __func__, path,
364 error);
365 }
366
367 return error;
368 }
369
370 int
vfs_mount_override_type_name(mount_t mp,const char * name)371 vfs_mount_override_type_name(mount_t mp, const char *name)
372 {
373 if (mp == NULL || name == NULL) {
374 return EINVAL;
375 }
376
377 /* Override the FS type name. */
378 mount_lock_spin(mp);
379 strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
380 mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
381 mount_unlock(mp);
382
383 return 0;
384 }
385
386 /*
387 * Mount a file system.
388 */
389 /* ARGSUSED */
390 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)391 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
392 {
393 struct __mac_mount_args muap;
394
395 muap.type = uap->type;
396 muap.path = uap->path;
397 muap.flags = uap->flags;
398 muap.data = uap->data;
399 muap.mac_p = USER_ADDR_NULL;
400 return __mac_mount(p, &muap, retval);
401 }
402
403 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)404 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
405 {
406 struct componentname cn;
407 vfs_context_t ctx = vfs_context_current();
408 size_t dummy = 0;
409 int error;
410 int flags = uap->flags;
411 char fstypename[MFSNAMELEN];
412 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
413 vnode_t pvp;
414 vnode_t vp;
415
416 AUDIT_ARG(fd, uap->fd);
417 AUDIT_ARG(fflags, flags);
418 /* fstypename will get audited by mount_common */
419
420 /* Sanity check the flags */
421 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
422 return ENOTSUP;
423 }
424
425 if (flags & MNT_UNION) {
426 return EPERM;
427 }
428
429 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
430 if (error) {
431 return error;
432 }
433
434 if ((error = file_vnode(uap->fd, &vp)) != 0) {
435 return error;
436 }
437
438 if ((error = vnode_getwithref(vp)) != 0) {
439 file_drop(uap->fd);
440 return error;
441 }
442
443 pvp = vnode_getparent(vp);
444 if (pvp == NULL) {
445 vnode_put(vp);
446 file_drop(uap->fd);
447 return EINVAL;
448 }
449
450 memset(&cn, 0, sizeof(struct componentname));
451 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
452 cn.cn_pnlen = MAXPATHLEN;
453
454 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
455 zfree(ZV_NAMEI, cn.cn_pnbuf);
456 vnode_put(pvp);
457 vnode_put(vp);
458 file_drop(uap->fd);
459 return error;
460 }
461
462 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
463
464 zfree(ZV_NAMEI, cn.cn_pnbuf);
465 vnode_put(pvp);
466 vnode_put(vp);
467 file_drop(uap->fd);
468
469 return error;
470 }
471
/*
 * Notify interested parties that a mount has occurred: signal a VQ_MOUNT
 * vfs event system-wide, and post a NOTE_WRITE knote on the parent
 * directory of the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
478
479 /*
480 * __mac_mount:
481 * Mount a file system taking into account MAC label behavior.
482 * See mount(2) man page for more information
483 *
484 * Parameters: p Process requesting the mount
485 * uap User argument descriptor (see below)
486 * retval (ignored)
487 *
488 * Indirect: uap->type Filesystem type
489 * uap->path Path to mount
490 * uap->data Mount arguments
491 * uap->mac_p MAC info
492 * uap->flags Mount flags
493 *
494 *
495 * Returns: 0 Success
496 * !0 Not success
497 */
498 boolean_t root_fs_upgrade_try = FALSE;
499
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;         /* nonzero once namei() has succeeded */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (and its parent, via WANTPARENT)
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * NOTE(review): exact equality means MNT_IMGSRC_BY_INDEX may not
		 * be combined with any other flag bit here, and the comparison
		 * passed as the last argument below is therefore always true.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Normalize the 32/64-bit user structure into 'mac'. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Reject oversized labels and those too short to hold a string. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out: refuse the request outright. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the target is the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			/* A plain mount on "/" is treated as an update mount. */
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the iocounts and namei state acquired by the lookup above. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
657
658 /*
659 * common mount implementation (final stage of mounting)
660 *
661 * Arguments:
662 * fstypename file system type (ie it's vfs name)
663 * pvp parent of covered vnode
664 * vp covered vnode
665 * cnp component name (ie path) of covered vnode
666 * flags generic mount flags
667 * fsmountargs file system specific data
668 * labelstr optional MAC label
669 * kernelmount TRUE for mounts initiated from inside the kernel
670 * ctx caller's context
671 */
672 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)673 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
674 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
675 char *labelstr, vfs_context_t ctx)
676 {
677 #if !CONFIG_MACF
678 #pragma unused(labelstr)
679 #endif
680 struct vnode *devvp = NULLVP;
681 struct vnode *device_vnode = NULLVP;
682 #if CONFIG_MACF
683 struct vnode *rvp;
684 #endif
685 struct mount *mp;
686 struct vfstable *vfsp = (struct vfstable *)0;
687 struct proc *p = vfs_context_proc(ctx);
688 int error, flag = 0;
689 bool flag_set = false;
690 user_addr_t devpath = USER_ADDR_NULL;
691 int ronly = 0;
692 int mntalloc = 0;
693 boolean_t vfsp_ref = FALSE;
694 boolean_t is_rwlock_locked = FALSE;
695 boolean_t did_rele = FALSE;
696 boolean_t have_usecount = FALSE;
697 boolean_t did_set_lmount = FALSE;
698 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
699
700 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
701 /* Check for mutually-exclusive flag bits */
702 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
703 int bitcount = 0;
704 while (checkflags != 0) {
705 checkflags &= (checkflags - 1);
706 bitcount++;
707 }
708
709 if (bitcount > 1) {
710 //not allowed to request multiple mount-by-role flags
711 error = EINVAL;
712 goto out1;
713 }
714 #endif
715
716 /*
717 * Process an update for an existing mount
718 */
719 if (flags & MNT_UPDATE) {
720 if ((vp->v_flag & VROOT) == 0) {
721 error = EINVAL;
722 goto out1;
723 }
724 mp = vp->v_mount;
725
726 /* if unmount or mount in progress, return error */
727 mount_lock_spin(mp);
728 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
729 mount_unlock(mp);
730 error = EBUSY;
731 goto out1;
732 }
733 mp->mnt_lflag |= MNT_LMOUNT;
734 did_set_lmount = TRUE;
735 mount_unlock(mp);
736 lck_rw_lock_exclusive(&mp->mnt_rwlock);
737 is_rwlock_locked = TRUE;
738 /*
739 * We only allow the filesystem to be reloaded if it
740 * is currently mounted read-only.
741 */
742 if ((flags & MNT_RELOAD) &&
743 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
744 error = ENOTSUP;
745 goto out1;
746 }
747
748 /*
749 * If content protection is enabled, update mounts are not
750 * allowed to turn it off.
751 */
752 if ((mp->mnt_flag & MNT_CPROTECT) &&
753 ((flags & MNT_CPROTECT) == 0)) {
754 error = EINVAL;
755 goto out1;
756 }
757
758 /*
759 * can't turn off MNT_REMOVABLE either but it may be an unexpected
760 * failure to return an error for this so we'll just silently
761 * add it if it is not passed in.
762 */
763 if ((mp->mnt_flag & MNT_REMOVABLE) &&
764 ((flags & MNT_REMOVABLE) == 0)) {
765 flags |= MNT_REMOVABLE;
766 }
767
768 /* Can't downgrade the backer of the root FS */
769 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
770 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
771 error = ENOTSUP;
772 goto out1;
773 }
774
775 /*
776 * Only root, or the user that did the original mount is
777 * permitted to update it.
778 */
779 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
780 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
781 goto out1;
782 }
783 #if CONFIG_MACF
784 error = mac_mount_check_remount(ctx, mp);
785 if (error != 0) {
786 goto out1;
787 }
788 #endif
789 /*
790 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
791 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
792 */
793 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
794 flags |= MNT_NOSUID | MNT_NODEV;
795 if (mp->mnt_flag & MNT_NOEXEC) {
796 flags |= MNT_NOEXEC;
797 }
798 }
799 flag = mp->mnt_flag;
800 flag_set = true;
801
802
803
804 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
805
806 vfsp = mp->mnt_vtable;
807 goto update;
808 } // MNT_UPDATE
809
810 /*
811 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
812 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
813 */
814 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
815 flags |= MNT_NOSUID | MNT_NODEV;
816 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
817 flags |= MNT_NOEXEC;
818 }
819 }
820
821 /* XXXAUDIT: Should we capture the type on the error path as well? */
822 /* XXX cast-away const (audit_arg_text() does not modify its input) */
823 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
824 mount_list_lock();
825 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
826 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
827 vfsp->vfc_refcount++;
828 vfsp_ref = TRUE;
829 break;
830 }
831 }
832 mount_list_unlock();
833 if (vfsp == NULL) {
834 error = ENODEV;
835 goto out1;
836 }
837
838 /*
839 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
840 * except in ROSV configs and for the initial BaseSystem root.
841 */
842 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
843 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
844 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
845 error = EINVAL; /* unsupported request */
846 goto out1;
847 }
848
849 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
850 if (error != 0) {
851 goto out1;
852 }
853
854 /*
855 * Allocate and initialize the filesystem (mount_t)
856 */
857 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
858 mntalloc = 1;
859
860 /* Initialize the default IO constraints */
861 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
862 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
863 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
864 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
865 mp->mnt_devblocksize = DEV_BSIZE;
866 mp->mnt_alignmentmask = PAGE_MASK;
867 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
868 mp->mnt_ioscale = 1;
869 mp->mnt_ioflags = 0;
870 mp->mnt_realrootvp = NULLVP;
871 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
872
873 mp->mnt_lflag |= MNT_LMOUNT;
874 did_set_lmount = TRUE;
875
876 TAILQ_INIT(&mp->mnt_vnodelist);
877 TAILQ_INIT(&mp->mnt_workerqueue);
878 TAILQ_INIT(&mp->mnt_newvnodes);
879 mount_lock_init(mp);
880 lck_rw_lock_exclusive(&mp->mnt_rwlock);
881 is_rwlock_locked = TRUE;
882 mp->mnt_op = vfsp->vfc_vfsops;
883 mp->mnt_vtable = vfsp;
884 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
885 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
886 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
887 do {
888 int pathlen = MAXPATHLEN;
889
890 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
891 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
892 }
893 } while (0);
894 mp->mnt_vnodecovered = vp;
895 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
896 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
897 mp->mnt_devbsdunit = 0;
898 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
899
900 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
901 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
902
903 if (kernelmount) {
904 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
905 }
906 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
907 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
908 }
909
910 if (KERNEL_MOUNT_DEVFS & internal_flags) {
911 // kernel mounted devfs
912 mp->mnt_kern_flag |= MNTK_SYSTEM;
913 }
914
915 update:
916
917 /*
918 * Set the mount level flags.
919 */
920 if (flags & MNT_RDONLY) {
921 mp->mnt_flag |= MNT_RDONLY;
922 } else if (mp->mnt_flag & MNT_RDONLY) {
923 // disallow read/write upgrades of file systems that
924 // had the TYPENAME_OVERRIDE feature set.
925 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
926 error = EPERM;
927 goto out1;
928 }
929 mp->mnt_kern_flag |= MNTK_WANTRDWR;
930 }
931 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
932 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
933 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
934 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
935 MNT_QUARANTINE | MNT_CPROTECT);
936
937 #if SECURE_KERNEL
938 #if !CONFIG_MNT_SUID
939 /*
940 * On release builds of iOS based platforms, always enforce NOSUID on
941 * all mounts. We do this here because we can catch update mounts as well as
942 * non-update mounts in this case.
943 */
944 mp->mnt_flag |= (MNT_NOSUID);
945 #endif
946 #endif
947
948 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
949 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
950 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
951 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
952 MNT_QUARANTINE | MNT_CPROTECT);
953
954 #if CONFIG_MACF
955 if (flags & MNT_MULTILABEL) {
956 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
957 error = EINVAL;
958 goto out1;
959 }
960 mp->mnt_flag |= MNT_MULTILABEL;
961 }
962 #endif
963 /*
964 * Process device path for local file systems if requested.
965 *
966 * Snapshot and mount-by-role mounts do not use this path; they are
967 * passing other opaque data in the device path field.
968 *
969 * Basesystemroot mounts pass a device path to be resolved here,
970 * but it's just a char * already inside the kernel, which
971 * kernel_mount() shoved into a user_addr_t to call us. So for such
972 * mounts we must skip copyin (both of the address and of the string
973 * (in NDINIT).
974 */
975 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
976 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
977 boolean_t do_copyin_devpath = true;
978 #if CONFIG_BASESYSTEMROOT
979 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
980 // KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
981 // We have been passed fsmountargs, which is typed as a user_addr_t,
982 // but is actually a char ** pointing to a (kernelspace) string.
983 // We manually unpack it with a series of casts and dereferences
984 // that reverses what was done just above us on the stack in
985 // imageboot_pivot_image().
986 // After retrieving the path to the dev node (which we will NDINIT
987 // in a moment), we pass NULL fsmountargs on to the filesystem.
988 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
989 char **devnamepp = (char **)fsmountargs;
990 char *devnamep = *devnamepp;
991 devpath = CAST_USER_ADDR_T(devnamep);
992 do_copyin_devpath = false;
993 fsmountargs = USER_ADDR_NULL;
994
995 //Now that we have a mp, denote that this mount is for the basesystem.
996 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
997 }
998 #endif // CONFIG_BASESYSTEMROOT
999
1000 if (do_copyin_devpath) {
1001 if (vfs_context_is64bit(ctx)) {
1002 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1003 goto out1;
1004 }
1005 fsmountargs += sizeof(devpath);
1006 } else {
1007 user32_addr_t tmp;
1008 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1009 goto out1;
1010 }
1011 /* munge into LP64 addr */
1012 devpath = CAST_USER_ADDR_T(tmp);
1013 fsmountargs += sizeof(tmp);
1014 }
1015 }
1016
1017 /* Lookup device and authorize access to it */
1018 if ((devpath)) {
1019 struct nameidata nd;
1020
1021 enum uio_seg seg = UIO_USERSPACE;
1022 #if CONFIG_BASESYSTEMROOT
1023 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1024 seg = UIO_SYSSPACE;
1025 }
1026 #endif // CONFIG_BASESYSTEMROOT
1027
1028 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1029 if ((error = namei(&nd))) {
1030 goto out1;
1031 }
1032
1033 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1034 devvp = nd.ni_vp;
1035
1036 nameidone(&nd);
1037
1038 if (devvp->v_type != VBLK) {
1039 error = ENOTBLK;
1040 goto out2;
1041 }
1042 if (major(devvp->v_rdev) >= nblkdev) {
1043 error = ENXIO;
1044 goto out2;
1045 }
1046 /*
1047 * If mount by non-root, then verify that user has necessary
1048 * permissions on the device.
1049 */
1050 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1051 mode_t accessmode = KAUTH_VNODE_READ_DATA;
1052
1053 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1054 accessmode |= KAUTH_VNODE_WRITE_DATA;
1055 }
1056 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1057 goto out2;
1058 }
1059 }
1060 }
1061 /* On first mount, preflight and open device */
1062 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1063 if ((error = vnode_ref(devvp))) {
1064 goto out2;
1065 }
1066 /*
1067 * Disallow multiple mounts of the same device.
1068 * Disallow mounting of a device that is currently in use
1069 * (except for root, which might share swap device for miniroot).
1070 * Flush out any old buffers remaining from a previous use.
1071 */
1072 if ((error = vfs_mountedon(devvp))) {
1073 goto out3;
1074 }
1075
1076 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1077 error = EBUSY;
1078 goto out3;
1079 }
1080 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1081 error = ENOTBLK;
1082 goto out3;
1083 }
1084 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1085 goto out3;
1086 }
1087
1088 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1089 #if CONFIG_MACF
1090 error = mac_vnode_check_open(ctx,
1091 devvp,
1092 ronly ? FREAD : FREAD | FWRITE);
1093 if (error) {
1094 goto out3;
1095 }
1096 #endif /* MAC */
1097 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1098 goto out3;
1099 }
1100
1101 mp->mnt_devvp = devvp;
1102 device_vnode = devvp;
1103 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1104 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1105 (device_vnode = mp->mnt_devvp)) {
1106 dev_t dev;
1107 int maj;
1108 /*
1109 * If upgrade to read-write by non-root, then verify
1110 * that user has necessary permissions on the device.
1111 */
1112 vnode_getalways(device_vnode);
1113
1114 if (suser(vfs_context_ucred(ctx), NULL) &&
1115 (error = vnode_authorize(device_vnode, NULL,
1116 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1117 ctx)) != 0) {
1118 vnode_put(device_vnode);
1119 goto out2;
1120 }
1121
1122 /* Tell the device that we're upgrading */
1123 dev = (dev_t)device_vnode->v_rdev;
1124 maj = major(dev);
1125
1126 if ((u_int)maj >= (u_int)nblkdev) {
1127 panic("Volume mounted on a device with invalid major number.");
1128 }
1129
1130 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1131 vnode_put(device_vnode);
1132 device_vnode = NULLVP;
1133 if (error != 0) {
1134 goto out2;
1135 }
1136 }
1137 } // localargs && !(snapshot | data | vm)
1138
1139 #if CONFIG_MACF
1140 if ((flags & MNT_UPDATE) == 0) {
1141 mac_mount_label_init(mp);
1142 mac_mount_label_associate(ctx, mp);
1143 }
1144 if (labelstr) {
1145 if ((flags & MNT_UPDATE) != 0) {
1146 error = mac_mount_check_label_update(ctx, mp);
1147 if (error != 0) {
1148 goto out3;
1149 }
1150 }
1151 }
1152 #endif
1153 /*
1154 * Mount the filesystem. We already asserted that internal_flags
1155 * cannot have more than one mount-by-role bit set.
1156 */
1157 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1158 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1159 (caddr_t)fsmountargs, 0, ctx);
1160 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1161 #if CONFIG_ROSV_STARTUP
1162 struct mount *origin_mp = (struct mount*)fsmountargs;
1163 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1164 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1165 if (error) {
1166 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1167 } else {
1168 /* Mark volume associated with system volume */
1169 mp->mnt_kern_flag |= MNTK_SYSTEM;
1170
1171 /* Attempt to acquire the mnt_devvp and set it up */
1172 struct vnode *mp_devvp = NULL;
1173 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1174 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1175 0, &mp_devvp, vfs_context_kernel());
1176 if (!lerr) {
1177 mp->mnt_devvp = mp_devvp;
1178 //vnode_lookup took an iocount, need to drop it.
1179 vnode_put(mp_devvp);
1180 // now set `device_vnode` to the devvp that was acquired.
1181 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1182 // note that though the iocount above was dropped, the mount acquires
1183 // an implicit reference against the device.
1184 device_vnode = mp_devvp;
1185 }
1186 }
1187 }
1188 #else
1189 error = EINVAL;
1190 #endif
1191 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1192 #if CONFIG_MOUNT_VM
1193 struct mount *origin_mp = (struct mount*)fsmountargs;
1194 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1195 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1196 if (error) {
1197 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1198 } else {
1199 /* Mark volume associated with system volume and a swap mount */
1200 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1201 /* Attempt to acquire the mnt_devvp and set it up */
1202 struct vnode *mp_devvp = NULL;
1203 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1204 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1205 0, &mp_devvp, vfs_context_kernel());
1206 if (!lerr) {
1207 mp->mnt_devvp = mp_devvp;
1208 //vnode_lookup took an iocount, need to drop it.
1209 vnode_put(mp_devvp);
1210
1211 // now set `device_vnode` to the devvp that was acquired.
1212 // note that though the iocount above was dropped, the mount acquires
1213 // an implicit reference against the device.
1214 device_vnode = mp_devvp;
1215 }
1216 }
1217 }
1218 #else
1219 error = EINVAL;
1220 #endif
1221 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1222 #if CONFIG_MOUNT_PREBOOTRECOVERY
1223 struct mount *origin_mp = (struct mount*)fsmountargs;
1224 uint32_t mount_role = 0;
1225 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1226 mount_role = VFS_PREBOOT_ROLE;
1227 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1228 mount_role = VFS_RECOVERY_ROLE;
1229 }
1230
1231 if (mount_role != 0) {
1232 fs_role_mount_args_t frma = {origin_mp, mount_role};
1233 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1234 if (error) {
1235 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1236 } else {
1237 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1238 /* Mark volume associated with system volume */
1239 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1240 /* Attempt to acquire the mnt_devvp and set it up */
1241 struct vnode *mp_devvp = NULL;
1242 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1243 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1244 0, &mp_devvp, vfs_context_kernel());
1245 if (!lerr) {
1246 mp->mnt_devvp = mp_devvp;
1247 //vnode_lookup took an iocount, need to drop it.
1248 vnode_put(mp_devvp);
1249
1250 // now set `device_vnode` to the devvp that was acquired.
1251 // note that though the iocount above was dropped, the mount acquires
1252 // an implicit reference against the device.
1253 device_vnode = mp_devvp;
1254 }
1255 }
1256 }
1257 } else {
1258 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1259 error = EINVAL;
1260 }
1261 #else
1262 error = EINVAL;
1263 #endif
1264 } else {
1265 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1266 }
1267
1268 if (flags & MNT_UPDATE) {
1269 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1270 mp->mnt_flag &= ~MNT_RDONLY;
1271 }
1272 mp->mnt_flag &= ~
1273 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1274 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1275 if (error) {
1276 mp->mnt_flag = flag; /* restore flag value */
1277 }
1278 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1279 lck_rw_done(&mp->mnt_rwlock);
1280 is_rwlock_locked = FALSE;
1281 if (!error) {
1282 enablequotas(mp, ctx);
1283 }
1284 goto exit;
1285 }
1286
1287 /*
1288 * Put the new filesystem on the mount list after root.
1289 */
1290 if (error == 0) {
1291 struct vfs_attr vfsattr;
1292 #if CONFIG_MACF
1293 error = mac_mount_check_mount_late(ctx, mp);
1294 if (error != 0) {
1295 goto out4;
1296 }
1297
1298 if (vfs_flags(mp) & MNT_MULTILABEL) {
1299 error = VFS_ROOT(mp, &rvp, ctx);
1300 if (error) {
1301 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1302 goto out4;
1303 }
1304 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1305 /*
1306 * drop reference provided by VFS_ROOT
1307 */
1308 vnode_put(rvp);
1309
1310 if (error) {
1311 goto out4;
1312 }
1313 }
1314 #endif /* MAC */
1315
1316 vnode_lock_spin(vp);
1317 CLR(vp->v_flag, VMOUNT);
1318 vp->v_mountedhere = mp;
1319 vnode_unlock(vp);
1320
1321 /*
1322 * taking the name_cache_lock exclusively will
1323 * insure that everyone is out of the fast path who
1324 * might be trying to use a now stale copy of
1325 * vp->v_mountedhere->mnt_realrootvp
1326 * bumping mount_generation causes the cached values
1327 * to be invalidated
1328 */
1329 name_cache_lock();
1330 mount_generation++;
1331 name_cache_unlock();
1332
1333 error = vnode_ref(vp);
1334 if (error != 0) {
1335 goto out4;
1336 }
1337
1338 have_usecount = TRUE;
1339
1340 error = checkdirs(vp, ctx);
1341 if (error != 0) {
1342 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1343 goto out4;
1344 }
1345 /*
1346 * there is no cleanup code here so I have made it void
1347 * we need to revisit this
1348 */
1349 (void)VFS_START(mp, 0, ctx);
1350
1351 if (mount_list_add(mp) != 0) {
1352 /*
1353 * The system is shutting down trying to umount
1354 * everything, so fail with a plausible errno.
1355 */
1356 error = EBUSY;
1357 goto out4;
1358 }
1359 lck_rw_done(&mp->mnt_rwlock);
1360 is_rwlock_locked = FALSE;
1361
1362 /* Check if this mounted file system supports EAs or named streams. */
1363 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1364 VFSATTR_INIT(&vfsattr);
1365 VFSATTR_WANTED(&vfsattr, f_capabilities);
1366 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1367 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1368 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1369 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1370 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1371 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1372 }
1373 #if NAMEDSTREAMS
1374 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1375 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1376 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1377 }
1378 #endif
1379 /* Check if this file system supports path from id lookups. */
1380 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1381 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1382 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1383 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1384 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1385 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1386 }
1387
1388 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1389 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1390 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1391 }
1392 }
1393 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1394 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1395 }
1396 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1397 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1398 }
1399 /* increment the operations count */
1400 OSAddAtomic(1, &vfs_nummntops);
1401 enablequotas(mp, ctx);
1402
1403 if (device_vnode) {
1404 device_vnode->v_specflags |= SI_MOUNTEDON;
1405
1406 /*
1407 * cache the IO attributes for the underlying physical media...
1408 * an error return indicates the underlying driver doesn't
1409 * support all the queries necessary... however, reasonable
1410 * defaults will have been set, so no reason to bail or care
1411 */
1412 vfs_init_io_attributes(device_vnode, mp);
1413 }
1414
1415 /* Now that mount is setup, notify the listeners */
1416 vfs_notify_mount(pvp);
1417 IOBSDMountChange(mp, kIOMountChangeMount);
1418 } else {
1419 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1420 if (mp->mnt_vnodelist.tqh_first != NULL) {
1421 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1422 mp->mnt_vtable->vfc_name, error);
1423 }
1424
1425 vnode_lock_spin(vp);
1426 CLR(vp->v_flag, VMOUNT);
1427 vnode_unlock(vp);
1428 mount_list_lock();
1429 mp->mnt_vtable->vfc_refcount--;
1430 mount_list_unlock();
1431
1432 if (device_vnode) {
1433 vnode_rele(device_vnode);
1434 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1435 }
1436 lck_rw_done(&mp->mnt_rwlock);
1437 is_rwlock_locked = FALSE;
1438
1439 /*
1440 * if we get here, we have a mount structure that needs to be freed,
1441 * but since the coveredvp hasn't yet been updated to point at it,
1442 * no need to worry about other threads holding a crossref on this mp
1443 * so it's ok to just free it
1444 */
1445 mount_lock_destroy(mp);
1446 #if CONFIG_MACF
1447 mac_mount_label_destroy(mp);
1448 #endif
1449 zfree(mount_zone, mp);
1450 did_set_lmount = false;
1451 }
1452 exit:
1453 /*
1454 * drop I/O count on the device vp if there was one
1455 */
1456 if (devpath && devvp) {
1457 vnode_put(devvp);
1458 }
1459
1460 if (did_set_lmount) {
1461 mount_lock_spin(mp);
1462 mp->mnt_lflag &= ~MNT_LMOUNT;
1463 mount_unlock(mp);
1464 }
1465
1466 return error;
1467
1468 /* Error condition exits */
1469 out4:
1470 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1471
1472 /*
1473 * If the mount has been placed on the covered vp,
1474 * it may have been discovered by now, so we have
1475 * to treat this just like an unmount
1476 */
1477 mount_lock_spin(mp);
1478 mp->mnt_lflag |= MNT_LDEAD;
1479 mount_unlock(mp);
1480
1481 if (device_vnode != NULLVP) {
1482 vnode_rele(device_vnode);
1483 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1484 ctx);
1485 did_rele = TRUE;
1486 }
1487
1488 vnode_lock_spin(vp);
1489
1490 mp->mnt_crossref++;
1491 vp->v_mountedhere = (mount_t) 0;
1492
1493 vnode_unlock(vp);
1494
1495 if (have_usecount) {
1496 vnode_rele(vp);
1497 }
1498 out3:
1499 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1500 vnode_rele(devvp);
1501 }
1502 out2:
1503 if (devpath && devvp) {
1504 vnode_put(devvp);
1505 }
1506 out1:
1507 /* Release mnt_rwlock only when it was taken */
1508 if (is_rwlock_locked == TRUE) {
1509 if (flag_set) {
1510 mp->mnt_flag = flag; /* restore mnt_flag value */
1511 }
1512 lck_rw_done(&mp->mnt_rwlock);
1513 }
1514
1515 if (did_set_lmount) {
1516 mount_lock_spin(mp);
1517 mp->mnt_lflag &= ~MNT_LMOUNT;
1518 mount_unlock(mp);
1519 }
1520
1521 if (mntalloc) {
1522 if (mp->mnt_crossref) {
1523 mount_dropcrossref(mp, vp, 0);
1524 } else {
1525 mount_lock_destroy(mp);
1526 #if CONFIG_MACF
1527 mac_mount_label_destroy(mp);
1528 #endif
1529 zfree(mount_zone, mp);
1530 }
1531 }
1532 if (vfsp_ref) {
1533 mount_list_lock();
1534 vfsp->vfc_refcount--;
1535 mount_list_unlock();
1536 }
1537
1538 return error;
1539 }
1540
1541 /*
1542 * Flush in-core data, check for competing mount attempts,
1543 * and set VMOUNT
1544 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH callers skip the directory-ownership check entirely */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	/* fmount(2) uses a stricter busy test below */
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data for the covered vnode before it becomes unreachable */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Mounts may only cover directories */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * Busy check under the vnode lock:
	 * - fmount: busy if a mount is in progress (VMOUNT) OR something is
	 *   already mounted here.
	 * - regular mount: busy only if both are true (a plain in-progress
	 *   marker or an existing covered mount alone is tolerated here).
	 */
	vnode_lock_spin(vp);
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Mark mount-in-progress; callers must clear VMOUNT on failure */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC policy vetoed the mount: undo the VMOUNT marker set above */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1610
1611 #if CONFIG_IMGSRC_ACCESS
1612
1613 #define DEBUG_IMGSRC 0
1614
1615 #if DEBUG_IMGSRC
1616 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1617 #else
1618 #define IMGSRC_DEBUG(args...) do { } while(0)
1619 #endif
1620
1621 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1622 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1623 {
1624 struct nameidata nd;
1625 vnode_t vp, realdevvp;
1626 mode_t accessmode;
1627 int error;
1628 enum uio_seg uio = UIO_USERSPACE;
1629
1630 if (ctx == vfs_context_kernel()) {
1631 uio = UIO_SYSSPACE;
1632 }
1633
1634 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1635 if ((error = namei(&nd))) {
1636 IMGSRC_DEBUG("namei() failed with %d\n", error);
1637 return error;
1638 }
1639
1640 vp = nd.ni_vp;
1641
1642 if (!vnode_isblk(vp)) {
1643 IMGSRC_DEBUG("Not block device.\n");
1644 error = ENOTBLK;
1645 goto out;
1646 }
1647
1648 realdevvp = mp->mnt_devvp;
1649 if (realdevvp == NULLVP) {
1650 IMGSRC_DEBUG("No device backs the mount.\n");
1651 error = ENXIO;
1652 goto out;
1653 }
1654
1655 error = vnode_getwithref(realdevvp);
1656 if (error != 0) {
1657 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1658 goto out;
1659 }
1660
1661 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1662 IMGSRC_DEBUG("Wrong dev_t.\n");
1663 error = ENXIO;
1664 goto out1;
1665 }
1666
1667 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1668
1669 /*
1670 * If mount by non-root, then verify that user has necessary
1671 * permissions on the device.
1672 */
1673 if (!vfs_context_issuser(ctx)) {
1674 accessmode = KAUTH_VNODE_READ_DATA;
1675 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1676 accessmode |= KAUTH_VNODE_WRITE_DATA;
1677 }
1678 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1679 IMGSRC_DEBUG("Access denied.\n");
1680 goto out1;
1681 }
1682 }
1683
1684 *devvpp = vp;
1685
1686 out1:
1687 vnode_put(realdevvp);
1688
1689 out:
1690 nameidone(&nd);
1691
1692 if (error) {
1693 vnode_put(vp);
1694 }
1695
1696 return error;
1697 }
1698
1699 /*
1700 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1701 * and call checkdirs()
1702 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Clear the in-progress marker and hook the mount onto the vnode atomically */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Take a usecount on the covered vnode for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* On any failure, detach the mount from the covered vnode again */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1748
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode and detach the mount from it on both sides of the link.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	/* Clear the covered-vnode -> mount link under the vnode lock */
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1759
/*
 * Begin an update (remount) of `mp`: take mnt_rwlock exclusively and
 * authorize the caller.  On success (0) the rwlock is held and must be
 * released with mount_end_update(); on failure the rwlock is dropped
 * before returning.
 */
static int
mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
{
	int error;

	/* unmount in progress return error */
	mount_lock_spin(mp);
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		mount_unlock(mp);
		return EBUSY;
	}
	mount_unlock(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/*
	 * We only allow the filesystem to be reloaded if it
	 * is currently mounted read-only.
	 */
	if ((flags & MNT_RELOAD) &&
	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to update it.
	 */
	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
	    (!vfs_context_issuser(ctx))) {
		error = EPERM;
		goto out;
	}
#if CONFIG_MACF
	error = mac_mount_check_remount(ctx, mp);
	if (error != 0) {
		goto out;
	}
#endif

out:
	/* On failure, don't leave mnt_rwlock held for the caller */
	if (error) {
		lck_rw_done(&mp->mnt_rwlock);
	}

	return error;
}
1807
/*
 * End an update started with mount_begin_update(): release the exclusive
 * hold on mnt_rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1813
1814 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)1815 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1816 {
1817 vnode_t vp;
1818
1819 if (height >= MAX_IMAGEBOOT_NESTING) {
1820 return EINVAL;
1821 }
1822
1823 vp = imgsrc_rootvnodes[height];
1824 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1825 *rvpp = vp;
1826 return 0;
1827 } else {
1828 return ENOENT;
1829 }
1830 }
1831
/*
 * Move an imageboot source filesystem (one of imgsrc_rootvnodes[]) so that
 * it is covered-mounted on top of `vp`.  Copies in either a
 * mnt_imgsrc_args structure (by_index) or a bare device path (binary
 * compatibility, height 0), validates the device for local filesystems,
 * places the mount on the covered vnode, and adds it to the mount list.
 * A mount may only be moved once (MNTK_HAS_MOVED).
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;	/* mount placed on covered vp; controls unwind */
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;		/* saved f_mntonname, restored if list-add fails */
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are defined for this operation yet */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; dropped on every exit path below */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed devvp for validation; drop its iocount now */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on path so it can be restored if list-add fails */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Undo the rename and the HAS_MOVED marker set just above */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2052
2053 #endif /* CONFIG_IMGSRC_ACCESS */
2054
/*
 * Turn on disk quotas for a freshly mounted (or updated) filesystem, for
 * each quota type whose trigger file exists on the volume.  Best-effort:
 * all errors are ignored so quota setup never fails the mount itself.
 * Currently limited to HFS by fstypename comparison.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the per-type quota "ops" trigger file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		/* Only existence mattered; drop the iocount namei took */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Build the path of the actual quota data file and enable quotas */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2088
2089
/*
 * Per-process callback used by checkdirs(): if the process's current or
 * root directory is `olddp` (the vnode that has just been covered by a
 * mount), swap it for `newdp` (the root of the new mount).  Always
 * returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as newdp and are NULLed once installed; old_* record replaced dirs */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* Second ref failed: give back the first one and bail */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;	/* this ref was consumed by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;	/* this ref was consumed by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2169
2170
2171
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * olddp is the covered vnode; olddp->v_mountedhere is the new mount.
 * Also swaps the global rootvnode if the covered vnode was the root.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/*
	 * Unlocked fast path: with a usecount of 1 no process can have
	 * olddp as cwd or root (presumably the one ref is the mount's
	 * own — TODO confirm), so there is nothing to retarget.
	 */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	if (rootvnode == olddp) {
		/* take the new ref before publishing the new rootvnode */
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2214
2215 /*
2216 * Unmount a file system.
2217 *
2218 * Note: unmount takes a path to the vnode mounted on as argument,
2219 * not special file (as before).
2220 */
2221 /* ARGSUSED */
2222 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2223 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2224 {
2225 vnode_t vp;
2226 struct mount *mp;
2227 int error;
2228 struct nameidata nd;
2229 vfs_context_t ctx = vfs_context_current();
2230
2231 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2232 UIO_USERSPACE, uap->path, ctx);
2233 error = namei(&nd);
2234 if (error) {
2235 return error;
2236 }
2237 vp = nd.ni_vp;
2238 mp = vp->v_mount;
2239 nameidone(&nd);
2240
2241 #if CONFIG_MACF
2242 error = mac_mount_check_umount(ctx, mp);
2243 if (error != 0) {
2244 vnode_put(vp);
2245 return error;
2246 }
2247 #endif
2248 /*
2249 * Must be the root of the filesystem
2250 */
2251 if ((vp->v_flag & VROOT) == 0) {
2252 vnode_put(vp);
2253 return EINVAL;
2254 }
2255 mount_ref(mp, 0);
2256 vnode_put(vp);
2257 /* safedounmount consumes the mount ref */
2258 return safedounmount(mp, uap->flags, ctx);
2259 }
2260
2261 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2262 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2263 {
2264 mount_t mp;
2265
2266 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2267 if (mp == (mount_t)0) {
2268 return ENOENT;
2269 }
2270 mount_ref(mp, 0);
2271 mount_iterdrop(mp);
2272 /* safedounmount consumes the mount ref */
2273 return safedounmount(mp, flags, ctx);
2274 }
2275
2276 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2277 "com.apple.private.vfs.role-account-unmount"
2278
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Performs policy checks (unresponsive-filesystem short circuit,
 * ownership/entitlement authorization, root and system-volume
 * protection, imageboot backing store) and then hands the mount ref
 * to dounmount().  On any policy failure the ref is dropped here and
 * an errno returned.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes to unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive (MNTK_PERMIT_UNMOUNT) and this is
	 *   not a forced-unmount attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount (withref == 1) takes ownership of the mount ref */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2346
/*
 * Do the actual file system unmount.
 *
 *	mp	mount to tear down
 *	flags	caller flags (MNT_FORCE, MNT_NOBLOCK, MNT_LNOSUB, ...)
 *	withref	non-zero when the caller passed in a mount ref to consume
 *	ctx	context of the requesting thread
 *
 * Returns 0 on success or an errno.  On the early error paths the
 * unmount-in-progress state (MNTK_UNMOUNT / MNT_LUNMOUNT / MNT_LFORCE)
 * is cleared again so the mount remains usable.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* sync failed: undo the unmount-in-progress state */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* vnodes still busy: undo the unmount-in-progress state */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure no one is in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* filesystem refused the unmount: re-enable iteration and undo state */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* notify watchers of the now-uncovered parent directory */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2634
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects the fsids of every mount nested (directly or transitively)
 * under mp, then unmounts them deepest-first.  Allocation failure
 * silently skips the whole pass; individual unmount errors are ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* covered by a doomed mount: doom this one too (array grows as we scan) */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	// Index 0 is mp itself, which the caller unmounts, so stop at 1.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	/* NOTE(review): fsids is NULL on the allocation-failure path — assumes kfree_data is NULL-safe; confirm */
	kfree_data(fsids, fsids_sz);
}
2695
/*
 * Drop one crossref on mount mp held via covered vnode dp.  When the
 * last crossref goes away and dp no longer points at mp (the unmount
 * path already cleared v_mountedhere), the mount structure itself is
 * destroyed and freed.  need_put != 0 additionally releases the
 * caller's iocount on dp while the vnode lock is still held.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_unlock(dp);

		/* last crossref and no longer mounted-on: free the mount */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2724
2725
/*
 * Sync each mounted filesystem.
 */
#if DIAGNOSTIC
int syncprt = 0;        /* when non-zero, vfs_bufstats() is run after a sync */
#endif

int print_vmpage_stat = 0;      /* when non-zero, vm_countdirtypages() is run after a sync */
2734
2735 /*
2736 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2737 * mounted read-write with the passed waitfor value.
2738 *
2739 * Parameters: mp mount-point descriptor per mounted file-system instance.
2740 * arg user argument (please see below)
2741 *
2742 * User argument is a pointer to 32 bit unsigned integer which describes the
2743 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2744 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2745 * waitfor value.
2746 *
2747 * Returns: VFS_RETURNED
2748 */
2749 static int
sync_callback(mount_t mp,void * arg)2750 sync_callback(mount_t mp, void *arg)
2751 {
2752 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2753 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2754 unsigned waitfor = MNT_NOWAIT;
2755
2756 if (arg) {
2757 waitfor = *(uint32_t*)arg;
2758 }
2759
2760 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2761 if (waitfor != MNT_WAIT &&
2762 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2763 waitfor != MNT_NOWAIT &&
2764 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2765 waitfor != MNT_DWAIT &&
2766 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2767 panic("Passed inappropriate waitfor %u to "
2768 "sync_callback()", waitfor);
2769 }
2770
2771 mp->mnt_flag &= ~MNT_ASYNC;
2772 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2773 if (asyncflag) {
2774 mp->mnt_flag |= MNT_ASYNC;
2775 }
2776 }
2777
2778 return VFS_RETURNED;
2779 }
2780
2781 /* ARGSUSED */
2782 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)2783 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2784 {
2785 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2786
2787 if (print_vmpage_stat) {
2788 vm_countdirtypages();
2789 }
2790
2791 #if DIAGNOSTIC
2792 if (syncprt) {
2793 vfs_bufstats();
2794 }
2795 #endif /* DIAGNOSTIC */
2796 return 0;
2797 }
2798
/*
 * Media classes used to order work in sync_thread().  "Reliable" means
 * a local, non-virtual device (see sync_internal_callback()).
 */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering: sync every volume */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual or non-local devices */
} sync_type_t;
2804
2805 static int
sync_internal_callback(mount_t mp,void * arg)2806 sync_internal_callback(mount_t mp, void *arg)
2807 {
2808 if (arg) {
2809 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2810 (mp->mnt_flag & MNT_LOCAL);
2811 sync_type_t sync_type = *((sync_type_t *)arg);
2812
2813 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2814 return VFS_RETURNED;
2815 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2816 return VFS_RETURNED;
2817 }
2818 }
2819
2820 (void)sync_callback(mp, NULL);
2821
2822 return VFS_RETURNED;
2823 }
2824
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; modified under sync_mtx_lck */
int sync_timeout_seconds = 5;   /* how long sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN         0x0001  /* more sync work has been queued */
#define SYNC_THREAD_RUNNING     0x0002  /* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
2834
/*
 * Body of the kernel thread created by sync_internal().  Loops while
 * more work is queued (SYNC_THREAD_RUN), syncing reliable media first
 * and unreliable media second, then signals any waiter and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* reliable (local, non-virtual) media first, then the rest */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2878
/* rate-limits the "sync timed out" console message below */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};

/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Queues work for (and, when needed, creates) the sync thread, then
 * waits up to sync_timeout_seconds for it to finish.  Always returns 0.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP: msleep releases sync_mtx_lck before returning */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		/* timed out (or interrupted): log at most once every 120s */
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2927
2928 /*
2929 * Change filesystem quotas.
2930 */
2931 #if QUOTA
2932 int
quotactl(proc_t p,struct quotactl_args * uap,__unused int32_t * retval)2933 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2934 {
2935 struct mount *mp;
2936 int error, quota_cmd, quota_status = 0;
2937 caddr_t datap;
2938 size_t fnamelen;
2939 struct nameidata nd;
2940 vfs_context_t ctx = vfs_context_current();
2941 struct dqblk my_dqblk = {};
2942
2943 AUDIT_ARG(uid, uap->uid);
2944 AUDIT_ARG(cmd, uap->cmd);
2945 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2946 uap->path, ctx);
2947 error = namei(&nd);
2948 if (error) {
2949 return error;
2950 }
2951 mp = nd.ni_vp->v_mount;
2952 mount_ref(mp, 0);
2953 vnode_put(nd.ni_vp);
2954 nameidone(&nd);
2955
2956 #if CONFIG_MACF
2957 error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
2958 if (error != 0) {
2959 goto out;
2960 }
2961 #endif
2962
2963 /* copyin any data we will need for downstream code */
2964 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2965
2966 switch (quota_cmd) {
2967 case Q_QUOTAON:
2968 /* uap->arg specifies a file from which to take the quotas */
2969 fnamelen = MAXPATHLEN;
2970 datap = zalloc(ZV_NAMEI);
2971 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2972 break;
2973 case Q_GETQUOTA:
2974 /* uap->arg is a pointer to a dqblk structure. */
2975 datap = (caddr_t) &my_dqblk;
2976 break;
2977 case Q_SETQUOTA:
2978 case Q_SETUSE:
2979 /* uap->arg is a pointer to a dqblk structure. */
2980 datap = (caddr_t) &my_dqblk;
2981 if (proc_is64bit(p)) {
2982 struct user_dqblk my_dqblk64;
2983 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
2984 if (error == 0) {
2985 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2986 }
2987 } else {
2988 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
2989 }
2990 break;
2991 case Q_QUOTASTAT:
2992 /* uap->arg is a pointer to an integer */
2993 datap = (caddr_t) "a_status;
2994 break;
2995 default:
2996 datap = NULL;
2997 break;
2998 } /* switch */
2999
3000 if (error == 0) {
3001 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
3002 }
3003
3004 switch (quota_cmd) {
3005 case Q_QUOTAON:
3006 if (datap != NULL) {
3007 zfree(ZV_NAMEI, datap);
3008 }
3009 break;
3010 case Q_GETQUOTA:
3011 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
3012 if (error == 0) {
3013 if (proc_is64bit(p)) {
3014 struct user_dqblk my_dqblk64;
3015
3016 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
3017 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
3018 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
3019 } else {
3020 error = copyout(datap, uap->arg, sizeof(struct dqblk));
3021 }
3022 }
3023 break;
3024 case Q_QUOTASTAT:
3025 /* uap->arg is a pointer to an integer */
3026 if (error == 0) {
3027 error = copyout(datap, uap->arg, sizeof(quota_status));
3028 }
3029 break;
3030 default:
3031 break;
3032 } /* switch */
3033
3034 out:
3035 mount_drop(mp, 0);
3036 return error;
3037 }
#else
/* Quota support compiled out: quotactl(2) is unsupported. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
#endif /* QUOTA */
3045
3046 /*
3047 * Get filesystem statistics.
3048 *
3049 * Returns: 0 Success
3050 * namei:???
3051 * vfs_update_vfsstat:???
3052 * munge_statfs:EFAULT
3053 */
3054 /* ARGSUSED */
3055 int
statfs(__unused proc_t p,struct statfs_args * uap,__unused int32_t * retval)3056 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3057 {
3058 struct mount *mp;
3059 struct vfsstatfs *sp;
3060 int error;
3061 struct nameidata nd;
3062 vfs_context_t ctx = vfs_context_current();
3063 vnode_t vp;
3064
3065 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3066 UIO_USERSPACE, uap->path, ctx);
3067 error = namei(&nd);
3068 if (error != 0) {
3069 return error;
3070 }
3071 vp = nd.ni_vp;
3072 mp = vp->v_mount;
3073 sp = &mp->mnt_vfsstat;
3074 nameidone(&nd);
3075
3076 #if CONFIG_MACF
3077 error = mac_mount_check_stat(ctx, mp);
3078 if (error != 0) {
3079 vnode_put(vp);
3080 return error;
3081 }
3082 #endif
3083
3084 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3085 if (error != 0) {
3086 vnode_put(vp);
3087 return error;
3088 }
3089
3090 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3091 vnode_put(vp);
3092 return error;
3093 }
3094
/*
 * Get filesystem statistics.
 *
 * fstatfs(2): like statfs(2), but the file system is identified by an
 * open file descriptor instead of a path.
 */
/* ARGSUSED */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	sp = &mp->mnt_vfsstat;
	/* refresh the cached statistics before copying them out */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3147
3148 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3149 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3150 {
3151 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3152
3153 bzero(sfs, sizeof(*sfs));
3154
3155 sfs->f_bsize = vsfs->f_bsize;
3156 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3157 sfs->f_blocks = vsfs->f_blocks;
3158 sfs->f_bfree = vsfs->f_bfree;
3159 sfs->f_bavail = vsfs->f_bavail;
3160 sfs->f_files = vsfs->f_files;
3161 sfs->f_ffree = vsfs->f_ffree;
3162 sfs->f_fsid = vsfs->f_fsid;
3163 sfs->f_owner = vsfs->f_owner;
3164 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3165 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3166 sfs->f_fssubtype = vsfs->f_fssubtype;
3167 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3168 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3169 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3170 } else {
3171 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3172 }
3173 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3174 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3175 }
3176
/*
 * Get file system statistics in 64-bit mode
 *
 * statfs64(2): path-based lookup, refresh of the cached vfsstatfs,
 * then copyout of a struct statfs64.
 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	/* nameidata + statfs64 are heap-allocated (presumably too large for the kernel stack — confirm) */
	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* refresh the cached statistics before copying them out */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3237
/*
 * Get file system statistics in 64-bit mode
 *
 * fstatfs64(2): like statfs64(2), but the file system is identified by
 * an open file descriptor instead of a path.
 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* refresh the cached statistics before copying them out */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3294
/*
 * Shared iteration state for the getfsstat family of syscalls; a single
 * instance is threaded through the vfs_iterate() callbacks below.
 */
struct getfsstat_struct {
    user_addr_t sfsp;   /* user-buffer cursor for the next statfs record */
    user_addr_t *mp;    /* optional array of user MAC-label pointers, or NULL */
    int count;          /* mounts visited so far */
    int maxcount;       /* capacity of the user buffer, in records */
    int flags;          /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT flags */
    int error;          /* first error hit by the callback, 0 if none */
};
3303
3304
static int
getfsstat_callback(mount_t mp, void * arg)
{
    struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
    struct vfsstatfs *sp;
    int error, my_size;
    vfs_context_t ctx = vfs_context_current();

    /*
     * Only copy records out while the user buffer has room; past that
     * point we keep iterating just to tally the total mount count.
     */
    if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
        error = mac_mount_check_stat(ctx, mp);
        if (error != 0) {
            fstp->error = error;
            return VFS_RETURNED_DONE;
        }
#endif
        sp = &mp->mnt_vfsstat;
        /*
         * If MNT_NOWAIT is specified, do not refresh the
         * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
         */
        if ((mp->mnt_lflag & MNT_LDEAD) ||
            (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
            (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
            (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
            /* Dead or failing mounts are skipped, not fatal to the scan. */
            KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
            return VFS_RETURNED;
        }

        /*
         * Need to handle LP64 version of struct statfs
         */
        error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
        if (error) {
            fstp->error = error;
            return VFS_RETURNED_DONE;
        }
        /* Advance by the ABI-dependent size actually copied out. */
        fstp->sfsp += my_size;

        if (fstp->mp) {
#if CONFIG_MACF
            /* Fill in the caller-supplied MAC label slot for this mount. */
            error = mac_mount_label_get(mp, *fstp->mp);
            if (error) {
                fstp->error = error;
                return VFS_RETURNED_DONE;
            }
#endif
            fstp->mp++;
        }
    }
    fstp->count++;
    return VFS_RETURNED;
}
3358
3359 /*
3360 * Get statistics on all filesystems.
3361 */
3362 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3363 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3364 {
3365 struct __mac_getfsstat_args muap;
3366
3367 muap.buf = uap->buf;
3368 muap.bufsize = uap->bufsize;
3369 muap.mac = USER_ADDR_NULL;
3370 muap.macsize = 0;
3371 muap.flags = uap->flags;
3372
3373 return __mac_getfsstat(p, &muap, retval);
3374 }
3375
3376 /*
3377 * __mac_getfsstat: Get MAC-related file system statistics
3378 *
3379 * Parameters: p (ignored)
3380 * uap User argument descriptor (see below)
3381 * retval Count of file system statistics (N stats)
3382 *
3383 * Indirect: uap->bufsize Buffer size
3384 * uap->macsize MAC info size
3385 * uap->buf Buffer where information will be returned
3386 * uap->mac MAC info
3387 * uap->flags File system flags
3388 *
3389 *
3390 * Returns: 0 Success
3391 * !0 Not success
3392 *
3393 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
    user_addr_t sfsp;
    user_addr_t *mp;
    size_t count, maxcount, bufsize, macsize;
    struct getfsstat_struct fst;

    /* Reject out-of-range user-supplied sizes up front. */
    if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
        return EINVAL;
    }

    bufsize = (size_t) uap->bufsize;
    macsize = (size_t) uap->macsize;

    /*
     * Capacity of the user buffer in statfs records, sized for the
     * caller's ABI (the LP64 and ILP32 struct statfs layouts differ).
     */
    if (IS_64BIT_PROCESS(p)) {
        maxcount = bufsize / sizeof(struct user64_statfs);
    } else {
        maxcount = bufsize / sizeof(struct user32_statfs);
    }
    sfsp = uap->buf;
    count = 0;

    mp = NULL;

#if CONFIG_MACF
    if (uap->mac != USER_ADDR_NULL) {
        u_int32_t *mp0;
        int error;
        unsigned int i;

        /*
         * The label array must have exactly one entry per statfs slot;
         * each entry is a user pointer (8 bytes for LP64, 4 otherwise).
         */
        count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
        if (count != maxcount) {
            return EINVAL;
        }

        /* Copy in the array */
        mp0 = kalloc_data(macsize, Z_WAITOK);
        if (mp0 == NULL) {
            return ENOMEM;
        }

        error = copyin(uap->mac, mp0, macsize);
        if (error) {
            kfree_data(mp0, macsize);
            return error;
        }

        /* Normalize to an array of user_addr_t */
        mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
        if (mp == NULL) {
            kfree_data(mp0, macsize);
            return ENOMEM;
        }

        for (i = 0; i < count; i++) {
            if (IS_64BIT_PROCESS(p)) {
                mp[i] = ((user_addr_t *)mp0)[i];
            } else {
                mp[i] = (user_addr_t)mp0[i];
            }
        }
        kfree_data(mp0, macsize);
    }
#endif


    fst.sfsp = sfsp;
    fst.mp = mp;
    fst.flags = uap->flags;
    fst.count = 0;
    fst.error = 0;
    fst.maxcount = (int)maxcount;


    /* Walk every mount (including in-flight unmounts; the callback
     * filters those), copying out one record per mount while room remains. */
    vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

    if (mp) {
        kfree_data(mp, count * sizeof(user_addr_t));
    }

    if (fst.error) {
        KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
        return fst.error;
    }

    /*
     * Report how many records were written (clamped to capacity when a
     * buffer was supplied), or the total mount count when buf is NULL.
     */
    if (fst.sfsp && fst.count > fst.maxcount) {
        *retval = fst.maxcount;
    } else {
        *retval = fst.count;
    }
    return 0;
}
3487
static int
getfsstat64_callback(mount_t mp, void * arg)
{
    struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
    struct vfsstatfs *sp;
    struct statfs64 sfs;
    int error;

    /*
     * Only copy records out while the user buffer has room; past that
     * point we keep iterating just to tally the total mount count.
     */
    if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
        error = mac_mount_check_stat(vfs_context_current(), mp);
        if (error != 0) {
            fstp->error = error;
            return VFS_RETURNED_DONE;
        }
#endif
        sp = &mp->mnt_vfsstat;
        /*
         * If MNT_NOWAIT is specified, do not refresh the fsstat
         * cache. MNT_WAIT overrides MNT_NOWAIT.
         *
         * We treat MNT_DWAIT as MNT_WAIT for all instances of
         * getfsstat, since the constants are out of the same
         * namespace.
         */
        if ((mp->mnt_lflag & MNT_LDEAD) ||
            ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
            (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
            (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
            /* Dead or failing mounts are skipped, not fatal to the scan. */
            KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
            return VFS_RETURNED;
        }

        /* statfs64 has a single fixed-size layout for all ABIs. */
        vfs_get_statfs64(mp, &sfs);
        error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
        if (error) {
            fstp->error = error;
            return VFS_RETURNED_DONE;
        }
        fstp->sfsp += sizeof(sfs);
    }
    fstp->count++;
    return VFS_RETURNED;
}
3532
3533 /*
3534 * Get statistics on all file systems in 64 bit mode.
3535 */
3536 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3537 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3538 {
3539 user_addr_t sfsp;
3540 int count, maxcount;
3541 struct getfsstat_struct fst;
3542
3543 maxcount = uap->bufsize / sizeof(struct statfs64);
3544
3545 sfsp = uap->buf;
3546 count = 0;
3547
3548 fst.sfsp = sfsp;
3549 fst.flags = uap->flags;
3550 fst.count = 0;
3551 fst.error = 0;
3552 fst.maxcount = maxcount;
3553
3554 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3555
3556 if (fst.error) {
3557 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3558 return fst.error;
3559 }
3560
3561 if (fst.sfsp && fst.count > fst.maxcount) {
3562 *retval = fst.maxcount;
3563 } else {
3564 *retval = fst.count;
3565 }
3566
3567 return 0;
3568 }
3569
3570 /*
 * Gets the vnode associated with the file descriptor passed in
 * as input.
3573 *
3574 * INPUT
3575 * ctx - vfs context of caller
3576 * fd - file descriptor for which vnode is required.
3577 * vpp - Pointer to pointer to vnode to be returned.
3578 *
3579 * The vnode is returned with an iocount so any vnode obtained
3580 * by this call needs a vnode_put
3581 *
3582 */
3583 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3584 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3585 {
3586 int error;
3587 vnode_t vp;
3588 struct fileproc *fp;
3589 proc_t p = vfs_context_proc(ctx);
3590
3591 *vpp = NULLVP;
3592
3593 error = fp_getfvp(p, fd, &fp, &vp);
3594 if (error) {
3595 return error;
3596 }
3597
3598 error = vnode_getwithref(vp);
3599 if (error) {
3600 (void)fp_drop(p, fd, fp, 0);
3601 return error;
3602 }
3603
3604 (void)fp_drop(p, fd, fp, 0);
3605 *vpp = vp;
3606 return error;
3607 }
3608
3609 /*
3610 * Wrapper function around namei to start lookup from a directory
3611 * specified by a file descriptor ni_dirfd.
3612 *
3613 * In addition to all the errors returned by namei, this call can
3614 * return ENOTDIR if the file descriptor does not refer to a directory.
3615 * and EBADF if the file descriptor is not valid.
3616 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
    /*
     * dirfd only matters for the initial lookup of a relative path;
     * continued lookups (NAMEI_CONTLOOKUP) and callers that already
     * supplied a starting directory (USEDVP) go straight to namei.
     */
    if ((dirfd != AT_FDCWD) &&
        !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
        !(ndp->ni_cnd.cn_flags & USEDVP)) {
        int error = 0;
        char c;

        /* Peek at the first byte of the path to detect absolute paths. */
        if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
            error = copyin(ndp->ni_dirp, &c, sizeof(char));
            if (error) {
                return error;
            }
        } else {
            c = *((char *)(ndp->ni_dirp));
        }

        if (c != '/') {
            vnode_t dvp_at;

            error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
                &dvp_at);
            if (error) {
                return error;
            }

            if (vnode_vtype(dvp_at) != VDIR) {
                vnode_put(dvp_at);
                return ENOTDIR;
            }

            /* Start the lookup at dirfd's directory; USEDVP is set only
             * for the duration of this namei call. */
            ndp->ni_dvp = dvp_at;
            ndp->ni_cnd.cn_flags |= USEDVP;
            error = namei(ndp);
            ndp->ni_cnd.cn_flags &= ~USEDVP;
            vnode_put(dvp_at);
            return error;
        }
    }

    /* Absolute path, AT_FDCWD, or continued lookup: plain namei. */
    return namei(ndp);
}
3660
3661 /*
3662 * Change current working directory to a given file descriptor.
3663 */
3664 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
    vnode_t vp;
    vnode_t tdp;
    vnode_t tvp;
    struct mount *mp;
    int error, should_put = 1;
    vfs_context_t ctx = vfs_context_current();

    AUDIT_ARG(fd, uap->fd);
    if (per_thread && uap->fd == -1) {
        /*
         * Switching back from per-thread to per process CWD; verify we
         * in fact have one before proceeding. The only success case
         * for this code path is to return 0 preemptively after zapping
         * the thread structure contents.
         */
        thread_t th = vfs_context_thread(ctx);
        if (th) {
            uthread_t uth = get_bsdthread_info(th);
            tvp = uth->uu_cdir;
            uth->uu_cdir = NULLVP;
            if (tvp != NULLVP) {
                vnode_rele(tvp);
                return 0;
            }
        }
        return EBADF;
    }

    if ((error = file_vnode(uap->fd, &vp))) {
        return error;
    }
    if ((error = vnode_getwithref(vp))) {
        file_drop(uap->fd);
        return error;
    }

    AUDIT_ARG(vnpath, vp, ARG_VNODE1);

    /* Only a directory may become the CWD. */
    if (vp->v_type != VDIR) {
        error = ENOTDIR;
        goto out;
    }

#if CONFIG_MACF
    error = mac_vnode_check_chdir(ctx, vp);
    if (error) {
        goto out;
    }
#endif
    /* Caller needs search permission on the new directory. */
    error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
    if (error) {
        goto out;
    }

    /*
     * If something is mounted on the target directory, descend through
     * the mount stack so the CWD becomes the root of the topmost mount.
     */
    while (!error && (mp = vp->v_mountedhere) != NULL) {
        if (vfs_busy(mp, LK_NOWAIT)) {
            error = EACCES;
            goto out;
        }
        error = VFS_ROOT(mp, &tdp, ctx);
        vfs_unbusy(mp);
        if (error) {
            break;
        }
        vnode_put(vp);
        vp = tdp;
    }
    if (error) {
        goto out;
    }
    /* Convert our transient iocount into a long-lived usecount. */
    if ((error = vnode_ref(vp))) {
        goto out;
    }
    vnode_put(vp);
    should_put = 0;

    if (per_thread) {
        thread_t th = vfs_context_thread(ctx);
        if (th) {
            uthread_t uth = get_bsdthread_info(th);
            tvp = uth->uu_cdir;
            uth->uu_cdir = vp;
            OSBitOrAtomic(P_THCWD, &p->p_flag);
        } else {
            /* No thread to attach the per-thread CWD to. */
            vnode_rele(vp);
            error = ENOENT;
            goto out;
        }
    } else {
        /* Swap in the new CWD under the proc's dirs + fd locks. */
        proc_dirs_lock_exclusive(p);
        proc_fdlock(p);
        tvp = p->p_fd.fd_cdir;
        p->p_fd.fd_cdir = vp;
        proc_fdunlock(p);
        proc_dirs_unlock_exclusive(p);
    }

    /* Drop the reference held by the previous CWD, if any. */
    if (tvp) {
        vnode_rele(tvp);
    }

out:
    if (should_put) {
        vnode_put(vp);
    }
    file_drop(uap->fd);

    return error;
}
3777
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
    /* Per-process flavor: change the process CWD to the fd's vnode. */
    return common_fchdir(p, uap, 0);
}
3783
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
    /*
     * Per-thread flavor. The cast presumes struct __pthread_fchdir_args
     * is layout-compatible with struct fchdir_args (single fd field) —
     * NOTE(review): confirm against the syscall args definitions.
     */
    return common_fchdir(p, (void *)uap, 1);
}
3789
3790
3791 /*
3792 * Change current working directory (".").
3793 *
3794 * Returns: 0 Success
3795 * change_dir:ENOTDIR
3796 * change_dir:???
3797 * vnode_ref:ENOENT No such file or directory
3798 */
3799 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
    int error;
    vnode_t tvp;

    /* Resolve the path; verifies it is a searchable directory. */
    error = change_dir(ndp, ctx);
    if (error) {
        return error;
    }
    /* Take a usecount so the CWD survives after the iocount is dropped. */
    if ((error = vnode_ref(ndp->ni_vp))) {
        vnode_put(ndp->ni_vp);
        return error;
    }
    /*
     * drop the iocount we picked up in change_dir
     */
    vnode_put(ndp->ni_vp);

    if (per_thread) {
        thread_t th = vfs_context_thread(ctx);
        if (th) {
            uthread_t uth = get_bsdthread_info(th);
            tvp = uth->uu_cdir;
            uth->uu_cdir = ndp->ni_vp;
            OSBitOrAtomic(P_THCWD, &p->p_flag);
        } else {
            /* No thread to attach the per-thread CWD to. */
            vnode_rele(ndp->ni_vp);
            return ENOENT;
        }
    } else {
        /* Swap in the new CWD under the proc's dirs + fd locks. */
        proc_dirs_lock_exclusive(p);
        proc_fdlock(p);
        tvp = p->p_fd.fd_cdir;
        p->p_fd.fd_cdir = ndp->ni_vp;
        proc_fdunlock(p);
        proc_dirs_unlock_exclusive(p);
    }

    /* Drop the reference held by the previous CWD, if any. */
    if (tvp) {
        vnode_rele(tvp);
    }

    return 0;
}
3845
3846
3847 /*
3848 * Change current working directory (".").
3849 *
3850 * Returns: 0 Success
3851 * chdir_internal:ENOTDIR
3852 * chdir_internal:ENOENT No such file or directory
3853 * chdir_internal:???
3854 */
3855 /* ARGSUSED */
3856 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)3857 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3858 {
3859 struct nameidata nd;
3860 vfs_context_t ctx = vfs_context_current();
3861
3862 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3863 UIO_USERSPACE, uap->path, ctx);
3864
3865 return chdir_internal(p, ctx, &nd, per_thread);
3866 }
3867
3868
3869 /*
3870 * chdir
3871 *
3872 * Change current working directory (".") for the entire process
3873 *
3874 * Parameters: p Process requesting the call
3875 * uap User argument descriptor (see below)
3876 * retval (ignored)
3877 *
3878 * Indirect parameters: uap->path Directory path
3879 *
3880 * Returns: 0 Success
3881 * common_chdir: ENOTDIR
3882 * common_chdir: ENOENT No such file or directory
3883 * common_chdir: ???
3884 *
3885 */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
    /* Per-process flavor of chdir(2). */
    return common_chdir(p, (void *)uap, 0);
}
3891
3892 /*
3893 * __pthread_chdir
3894 *
3895 * Change current working directory (".") for a single thread
3896 *
3897 * Parameters: p Process requesting the call
3898 * uap User argument descriptor (see below)
3899 * retval (ignored)
3900 *
3901 * Indirect parameters: uap->path Directory path
3902 *
3903 * Returns: 0 Success
3904 * common_chdir: ENOTDIR
3905 * common_chdir: ENOENT No such file or directory
3906 * common_chdir: ???
3907 *
3908 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
    /* Per-thread flavor: only the calling thread's CWD is changed. */
    return common_chdir(p, (void *)uap, 1);
}
3914
3915
3916 /*
3917 * Change notion of root (``/'') directory.
3918 */
3919 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
    struct filedesc *fdp = &p->p_fd;
    int error;
    struct nameidata nd;
    vnode_t tvp;
    vfs_context_t ctx = vfs_context_current();

    /* chroot is restricted to the superuser. */
    if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
        return error;
    }

    NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
        UIO_USERSPACE, uap->path, ctx);
    /* Resolve the path; verifies it is a searchable directory. */
    error = change_dir(&nd, ctx);
    if (error) {
        return error;
    }

#if CONFIG_MACF
    error = mac_vnode_check_chroot(ctx, nd.ni_vp,
        &nd.ni_cnd);
    if (error) {
        vnode_put(nd.ni_vp);
        return error;
    }
#endif

    /* Trade the transient iocount for a long-lived usecount. */
    if ((error = vnode_ref(nd.ni_vp))) {
        vnode_put(nd.ni_vp);
        return error;
    }
    vnode_put(nd.ni_vp);

    /*
     * This lock provides the guarantee that as long as you hold the lock
     * fdp->fd_rdir has a usecount on it. This is used to take an iocount
     * on a referenced vnode in namei when determining the rootvnode for
     * a process.
     */
    /* needed for synchronization with lookup */
    proc_dirs_lock_exclusive(p);
    /* needed for setting the flag and other activities on the fd itself */
    proc_fdlock(p);
    tvp = fdp->fd_rdir;
    fdp->fd_rdir = nd.ni_vp;
    fdt_flag_set(fdp, FD_CHROOT);
    proc_fdunlock(p);
    proc_dirs_unlock_exclusive(p);

    /* Release the previous root directory, if any. */
    if (tvp != NULL) {
        vnode_rele(tvp);
    }

    return 0;
}
3977
3978 #define PATHSTATICBUFLEN 256
3979 #define PIVOT_ROOT_ENTITLEMENT \
3980 "com.apple.private.vfs.pivot-root"
3981
3982 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
    int error;
    /* Fixed-size stack buffers first; fall back to heap on ENAMETOOLONG. */
    char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
    char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
    char *new_rootfs_path_before_buf = NULL;
    char *old_rootfs_path_after_buf = NULL;
    char *incoming = NULL;
    char *outgoing = NULL;
    vnode_t incoming_rootvp = NULLVP;
    size_t bytes_copied;

    /*
     * XXX : Additional restrictions needed
     * - perhaps callable only once.
     */
    if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
        return error;
    }

    /*
     * pivot_root can be executed by launchd only.
     * Enforce entitlement.
     */
    if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
        return EPERM;
    }

    /* Copy in the path of the filesystem that will become the root. */
    error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
    if (error == ENAMETOOLONG) {
        /* Path exceeds the static buffer; retry with a MAXPATHLEN zone buffer. */
        new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
        error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
    }

    if (error) {
        goto out;
    }

    /* Copy in the path where the old root will be re-mounted. */
    error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
    if (error == ENAMETOOLONG) {
        old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
        error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
    }
    if (error) {
        goto out;
    }

    /* Point at whichever buffer (static or heap) holds each path. */
    if (new_rootfs_path_before_buf) {
        incoming = new_rootfs_path_before_buf;
    } else {
        incoming = &new_rootfs_path_before[0];
    }

    if (old_rootfs_path_after_buf) {
        outgoing = old_rootfs_path_after_buf;
    } else {
        outgoing = &old_rootfs_path_after[0];
    }

    /*
     * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
     * Userland is not allowed to pivot to an image.
     */
    error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
    if (error) {
        goto out;
    }
    error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
    if (error) {
        goto out;
    }

    /* Perform the actual root switch. */
    error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
    if (incoming_rootvp != NULLVP) {
        vnode_put(incoming_rootvp);
        incoming_rootvp = NULLVP;
    }

    if (old_rootfs_path_after_buf) {
        zfree(ZV_NAMEI, old_rootfs_path_after_buf);
    }

    if (new_rootfs_path_before_buf) {
        zfree(ZV_NAMEI, new_rootfs_path_before_buf);
    }

    return error;
}
4074 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
    /* pivot_root is only implemented on macOS; reject elsewhere. */
    return nosys(p, NULL, retval);
}
4080 #endif /* XNU_TARGET_OS_OSX */
4081
4082 /*
4083 * Common routine for chroot and chdir.
4084 *
4085 * Returns: 0 Success
4086 * ENOTDIR Not a directory
4087 * namei:??? [anything namei can return]
4088 * vnode_authorize:??? [anything vnode_authorize can return]
4089 */
4090 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4091 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4092 {
4093 vnode_t vp;
4094 int error;
4095
4096 if ((error = namei(ndp))) {
4097 return error;
4098 }
4099 nameidone(ndp);
4100 vp = ndp->ni_vp;
4101
4102 if (vp->v_type != VDIR) {
4103 vnode_put(vp);
4104 return ENOTDIR;
4105 }
4106
4107 #if CONFIG_MACF
4108 error = mac_vnode_check_chdir(ctx, vp);
4109 if (error) {
4110 vnode_put(vp);
4111 return error;
4112 }
4113 #endif
4114
4115 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4116 if (error) {
4117 vnode_put(vp);
4118 return error;
4119 }
4120
4121 return error;
4122 }
4123
/*
 * Allocate the per-fd vnode data (for directories) associated with the
 * file glob.
 */
4127 struct fd_vn_data *
fg_vn_data_alloc(void)4128 fg_vn_data_alloc(void)
4129 {
4130 struct fd_vn_data *fvdata;
4131
4132 /* Allocate per fd vnode data */
4133 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4134 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4135 return fvdata;
4136 }
4137
4138 /*
4139 * Free the vnode data (for directories) associated with the file glob.
4140 */
4141 void
fg_vn_data_free(void * fgvndata)4142 fg_vn_data_free(void *fgvndata)
4143 {
4144 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4145
4146 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4147 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4148 kfree_type(struct fd_vn_data, fvdata);
4149 }
4150
4151 /*
4152 * Check permissions, allocate an open file structure,
4153 * and call the device open routine if any.
4154 *
4155 * Returns: 0 Success
4156 * EINVAL
4157 * EINTR
4158 * falloc:ENFILE
4159 * falloc:EMFILE
4160 * falloc:ENOMEM
4161 * vn_open_auth:???
4162 * dupfdopen:???
4163 * VNOP_ADVLOCK:???
4164 * vnode_setsize:???
4165 *
4166 * XXX Need to implement uid, gid
4167 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval)
{
    proc_t p = vfs_context_proc(ctx);
    uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
    struct fileproc *fp;
    vnode_t vp;
    int flags, oflags;
    int type, indx, error;
    struct vfs_context context;

    oflags = uflags;

    /* O_RDONLY|O_WRONLY|O_RDWR are mutually exclusive access modes. */
    if ((oflags & O_ACCMODE) == O_ACCMODE) {
        return EINVAL;
    }

    /* Convert open(2) flags to in-kernel fflags; the encryption hints
     * are stripped — they are not accepted from this entry point. */
    flags = FFLAGS(uflags);
    CLR(flags, FENCRYPTED);
    CLR(flags, FUNENCRYPTED);

    AUDIT_ARG(fflags, oflags);
    AUDIT_ARG(mode, vap->va_mode);

    /* Reserve a descriptor slot and fileproc up front. */
    if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
        return error;
    }
    if (flags & O_CLOEXEC) {
        fp->fp_flags |= FP_CLOEXEC;
    }
    if (flags & O_CLOFORK) {
        fp->fp_flags |= FP_CLOFORK;
    }

    /* setup state to recognize when fdesc_open was called */
    uu->uu_dupfd = -1;

    if ((error = vn_open_auth(ndp, &flags, vap))) {
        /*
         * A /dev/fd-style open surfaces as ENODEV/ENXIO with uu_dupfd
         * set; satisfy it by duping the existing descriptor instead.
         */
        if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
            if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
                *retval = indx;
                return 0;
            }
        }
        if (error == ERESTART) {
            error = EINTR;
        }
        fp_free(p, indx, fp);
        return error;
    }
    uu->uu_dupfd = 0;
    vp = ndp->ni_vp;

    /* Wire the opened vnode into the fileglob. */
    fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
    fp->fp_glob->fg_ops = &vnops;
    fp_set_data(fp, vp);

    /* Apply open-time advisory locking (O_EXLOCK / O_SHLOCK). */
    if (flags & (O_EXLOCK | O_SHLOCK)) {
        struct flock lf = {
            .l_whence = SEEK_SET,
        };

        if (flags & O_EXLOCK) {
            lf.l_type = F_WRLCK;
        } else {
            lf.l_type = F_RDLCK;
        }
        type = F_FLOCK;
        if ((flags & FNONBLOCK) == 0) {
            /* Blocking open: wait for the lock. */
            type |= F_WAIT;
        }
#if CONFIG_MACF
        error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
            F_SETLK, &lf);
        if (error) {
            goto bad;
        }
#endif
        if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
            goto bad;
        }
        /* Remember we hold the lock so the error path can release it. */
        fp->fp_glob->fg_flag |= FWASLOCKED;
    }

    /* try to truncate by setting the size attribute */
    if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
        goto bad;
    }

    /*
     * For directories we hold some additional information in the fd.
     */
    if (vnode_vtype(vp) == VDIR) {
        fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
    } else {
        fp->fp_glob->fg_vn_data = NULL;
    }

    /* Drop the namei iocount; the fileglob keeps the vnode referenced. */
    vnode_put(vp);

    /*
     * The first terminal open (without a O_NOCTTY) by a session leader
     * results in it being set as the controlling terminal.
     */
    if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
        !(flags & O_NOCTTY)) {
        int tmp = 0;

        (void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
            (caddr_t)&tmp, ctx);
    }

    /* Publish the descriptor in the fd table. */
    proc_fdlock(p);
    procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
    /* Decide whether this file's pages may come from the secluded pool. */
    if (secluded_for_filecache &&
        FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
        vnode_vtype(vp) == VREG) {
        memory_object_control_t moc;

        moc = ubc_getobject(vp, UBC_FLAGS_NONE);

        if (moc == MEMORY_OBJECT_CONTROL_NULL) {
            /* nothing to do... */
        } else if (fp->fp_glob->fg_flag & FWRITE) {
            /* writable -> no longer eligible for secluded pages */
            memory_object_mark_eligible_for_secluded(moc,
                FALSE);
        } else if (secluded_for_filecache == 1) {
            char pathname[32] = { 0, };
            size_t copied;
            /* XXX FBDP: better way to detect /Applications/ ? */
            if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
                (void)copyinstr(ndp->ni_dirp,
                    pathname,
                    sizeof(pathname),
                    &copied);
            } else {
                copystr(CAST_DOWN(void *, ndp->ni_dirp),
                    pathname,
                    sizeof(pathname),
                    &copied);
            }
            pathname[sizeof(pathname) - 1] = '\0';
            if (strncmp(pathname,
                "/Applications/",
                strlen("/Applications/")) == 0 &&
                strncmp(pathname,
                "/Applications/Camera.app/",
                strlen("/Applications/Camera.app/")) != 0) {
                /*
                 * not writable
                 * AND from "/Applications/"
                 * AND not from "/Applications/Camera.app/"
                 * ==> eligible for secluded
                 */
                memory_object_mark_eligible_for_secluded(moc,
                    TRUE);
            }
        } else if (secluded_for_filecache == 2) {
            size_t len = strlen(vp->v_name);
            if (!strncmp(vp->v_name, "dyld", len) ||
                !strncmp(vp->v_name, "launchd", len) ||
                !strncmp(vp->v_name, "Camera", len) ||
                !strncmp(vp->v_name, "mediaserverd", len) ||
                !strncmp(vp->v_name, "SpringBoard", len) ||
                !strncmp(vp->v_name, "backboardd", len)) {
                /*
                 * This file matters when launching Camera:
                 * do not store its contents in the secluded
                 * pool that will be drained on Camera launch.
                 */
                memory_object_mark_eligible_for_secluded(moc,
                    FALSE);
            }
        }
    }
#endif /* CONFIG_SECLUDED_MEMORY */

    fp_drop(p, indx, fp, 1);
    proc_fdunlock(p);

    *retval = indx;

    return 0;
bad:
    /* Error unwind: release any lock we took, close, free the fd slot. */
    context = *vfs_context_current();
    context.vc_ucred = fp->fp_glob->fg_cred;

    if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
        (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
        struct flock lf = {
            .l_whence = SEEK_SET,
            .l_type = F_UNLCK,
        };

        (void)VNOP_ADVLOCK(
            vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
    }

    vn_close(vp, fp->fp_glob->fg_flag, &context);
    vnode_put(vp);
    fp_free(p, indx, fp);

    return error;
}
4376
4377 /*
4378 * While most of the *at syscall handlers can call nameiat() which
4379 * is a wrapper around namei, the use of namei and initialisation
4380 * of nameidata are far removed and in different functions - namei
4381 * gets called in vn_open_auth for open1. So we'll just do here what
4382 * nameiat() does.
4383 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd)
{
    /* Mirror nameiat(): dirfd only matters for relative paths when no
     * starting directory was already supplied. */
    if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
        int error;
        char c;

        /* Peek at the first byte of the path to detect absolute paths. */
        if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
            error = copyin(ndp->ni_dirp, &c, sizeof(char));
            if (error) {
                return error;
            }
        } else {
            c = *((char *)(ndp->ni_dirp));
        }

        if (c != '/') {
            vnode_t dvp_at;

            error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
                &dvp_at);
            if (error) {
                return error;
            }

            if (vnode_vtype(dvp_at) != VDIR) {
                vnode_put(dvp_at);
                return ENOTDIR;
            }

            /* Start the lookup at dirfd's directory; open1's internal
             * namei (via vn_open_auth) honors USEDVP. */
            ndp->ni_dvp = dvp_at;
            ndp->ni_cnd.cn_flags |= USEDVP;
            error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
                retval);
            vnode_put(dvp_at);
            return error;
        }
    }

    /* Absolute path or AT_FDCWD: plain open1. */
    return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval);
}
4427
4428 /*
4429 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4430 *
4431 * Parameters: p Process requesting the open
4432 * uap User argument descriptor (see below)
4433 * retval Pointer to an area to receive the
 *                              return value from the system call
4435 *
4436 * Indirect: uap->path Path to open (same as 'open')
4437 * uap->flags Flags to open (same as 'open'
4438 * uap->uid UID to set, if creating
4439 * uap->gid GID to set, if creating
4440 * uap->mode File mode, if creating (same as 'open')
4441 * uap->xsecurity ACL to set, if creating
4442 *
4443 * Returns: 0 Success
4444 * !0 errno value
4445 *
4446 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4447 *
 * XXX: We should enumerate the possible errno values here, and where
4449 * in the code they originated.
4450 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;	/* copied-in filesec (ACL), host byte order */
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied filesec (ACL), if any, before anything else. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask; the sticky bit is never set at create time. */
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	/* Only set owner/group when the caller asked for a specific one. */
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval);
	/* Free the copied-in filesec now that open1() has completed. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4493
4494 /*
4495 * Go through the data-protected atomically controlled open (2)
4496 *
4497 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4498 */
4499 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4500 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4501 {
4502 int flags = uap->flags;
4503 int class = uap->class;
4504 int dpflags = uap->dpflags;
4505
4506 /*
4507 * Follow the same path as normal open(2)
4508 * Look up the item if it exists, and acquire the vnode.
4509 */
4510 struct vnode_attr va;
4511 struct nameidata nd;
4512 int cmode;
4513 int error;
4514
4515 VATTR_INIT(&va);
4516 /* Mask off all but regular access permissions */
4517 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4518 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4519
4520 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4521 uap->path, vfs_context_current());
4522
4523 /*
4524 * Initialize the extra fields in vnode_attr to pass down our
4525 * extra fields.
4526 * 1. target cprotect class.
4527 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4528 */
4529 if (flags & O_CREAT) {
4530 /* lower level kernel code validates that the class is valid before applying it. */
4531 if (class != PROTECTION_CLASS_DEFAULT) {
4532 /*
4533 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4534 * file behave the same as open (2)
4535 */
4536 VATTR_SET(&va, va_dataprotect_class, class);
4537 }
4538 }
4539
4540 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4541 if (flags & (O_RDWR | O_WRONLY)) {
4542 /* Not allowed to write raw encrypted bytes */
4543 return EINVAL;
4544 }
4545 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4546 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4547 }
4548 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4549 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4550 }
4551 }
4552
4553 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4554 NULL, NULL, retval);
4555
4556 return error;
4557 }
4558
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * vnode_attr and nameidata are allocated together on the heap
	 * (presumably to keep these large structures off the kernel
	 * stack — TODO confirm) and freed before returning.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	/* open1at() resolves relative paths against 'fd' (AT_FDCWD == cwd). */
	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
4591
4592 int
open(proc_t p,struct open_args * uap,int32_t * retval)4593 open(proc_t p, struct open_args *uap, int32_t *retval)
4594 {
4595 __pthread_testcancel(1);
4596 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4597 }
4598
4599 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)4600 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4601 int32_t *retval)
4602 {
4603 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4604 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4605 }
4606
4607 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)4608 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4609 int32_t *retval)
4610 {
4611 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4612 uap->mode, uap->fd, UIO_USERSPACE, retval);
4613 }
4614
4615 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)4616 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4617 {
4618 __pthread_testcancel(1);
4619 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4620 }
4621
4622 /*
4623 * openbyid_np: open a file given a file system id and a file system object id
 * the hfs file system object id is an fsobj_id_t {uint32, uint32};
 * for file systems that don't support object ids it is a node id (uint64_t).
4626 *
4627 * Parameters: p Process requesting the open
4628 * uap User argument descriptor (see below)
4629 * retval Pointer to an area to receive the
 *					return value from the system call
4631 *
4632 * Indirect: uap->path Path to open (same as 'open')
4633 *
4634 * uap->fsid id of target file system
4635 * uap->objid id of target file system object
4636 * uap->flags Flags to open (same as 'open')
4637 *
4638 * Returns: 0 Success
4639 * !0 errno value
4640 *
4641 *
 * XXX:		We should enumerate the possible errno values here, and where
4643 * in the code they originated.
4644 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Opening by id is a privileged operation. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return error;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from (fsid, objid), retrying with a buffer grown
	 * by MAXPATHLEN whenever the resolved path does not fit (ENOSPC).
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path before handing it to the open path. */
	buf[pathlen] = 0;

	/* The path is kernel-resident, hence UIO_SYSSPACE. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
4701
4702
4703 /*
4704 * Create a special file.
4705 */
4706 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4707
/*
 * mknod: create a character or block special file (or, via mkfifo1(),
 * a FIFO).  Requires superuser for device nodes.
 */
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	/* Requested mode, filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, (mode_t)uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled past this point. */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4810
4811 /*
4812 * Create a named pipe.
4813 *
4814 * Returns: 0 Success
4815 * EEXIST
4816 * namei:???
4817 * vnode_authorize:???
4818 * vn_create:???
4819 */
/*
 * mkfifo1: common FIFO-creation path shared by mkfifo(2),
 * mkfifo_extended(2), and mknod(2) with S_IFIFO.  'vap' carries the
 * caller-prepared attributes; va_type is forced to VFIFO here.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4862
4863
4864 /*
4865 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4866 *
4867 * Parameters: p Process requesting the open
4868 * uap User argument descriptor (see below)
4869 * retval (Ignored)
4870 *
4871 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4872 * uap->uid UID to set
4873 * uap->gid GID to set
4874 * uap->mode File mode to set (same as 'mkfifo')
4875 * uap->xsecurity ACL to set, if creating
4876 *
4877 * Returns: 0 Success
4878 * !0 errno value
4879 *
4880 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4881 *
 * XXX:		We should enumerate the possible errno values here, and where
4883 * in the code they originated.
4884 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;	/* copied-in filesec (ACL), host byte order */
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied filesec (ACL), if any. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	/* Requested mode, filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	/* Only set owner/group when the caller asked for a specific one. */
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

	/* Free the copied-in filesec now that mkfifo1() has completed. */
	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
4921
4922 /* ARGSUSED */
4923 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)4924 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4925 {
4926 struct vnode_attr va;
4927
4928 VATTR_INIT(&va);
4929 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
4930
4931 return mkfifo1(vfs_context_current(), uap->path, &va);
4932 }
4933
4934 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4935 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4936 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4937
/*
 * safe_getpath_new: build the path for 'dvp' (optionally with 'leafname'
 * appended) into 'path', never failing outright: on truncation or lookup
 * failure it falls back to an ancestor's path, the mount point, or "/",
 * and sets *truncated_path.  Returns the resulting length; per the
 * accounting below this appears to include the terminating NUL —
 * TODO confirm against vn_getpath()'s contract.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the trailing NUL with '/' and append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but left no room to append a leaf name. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain until some ancestor's path fits;
		 * fall back to the mount point or "/" if we run out.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5005
/*
 * safe_getpath: firmlink-following convenience wrapper around
 * safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	    follow_firmlinks);
}
5011
/*
 * safe_getpath_no_firmlink: convenience wrapper around safe_getpath_new()
 * that does not follow firmlinks.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	    follow_firmlinks);
}
5017
5018 /*
5019 * Make a hard file link.
5020 *
5021 * Returns: 0 Success
5022 * EPERM
5023 * EEXIST
5024 * EXDEV
5025 * namei:???
5026 * vnode_authorize:???
5027 * VNOP_LINK:???
5028 */
5029 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;	/* source, its parent, link parent, link target */
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	/* Reuse 'nd' for the second lookup: create mode, parent locked. */
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target note */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Post-link notifications: fsevents, kauth listeners, audit path. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					/* best effort: skip the parent event, don't fail the link */
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5243
5244 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5245 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5246 {
5247 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5248 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5249 }
5250
5251 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5252 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5253 {
5254 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5255 return EINVAL;
5256 }
5257
5258 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5259 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5260 }
5261
5262 /*
5263 * Make a symbolic link.
5264 *
5265 * We could add support for ACLs here too...
5266 */
5267 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;		/* symlink contents (the target string) */
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link-target string in; kernel callers pass it directly. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* Symlink mode: all access bits, filtered through the umask. */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}
	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* VNOP_SYMLINK didn't hand back a vnode: look it up. */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copied-in target string; kernel-supplied strings are not ours. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5426
5427 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5428 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5429 {
5430 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5431 uap->link, UIO_USERSPACE);
5432 }
5433
5434 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5435 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5436 __unused int32_t *retval)
5437 {
5438 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5439 uap->path2, UIO_USERSPACE);
5440 }
5441
5442 /*
5443 * Delete a whiteout from the filesystem.
5444 * No longer supported.
5445 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout deletion support was removed; this always fails. */
	return ENOTSUP;
}
5451
5452 /*
5453 * Delete a name from the filesystem.
5454 */
5455 /* ARGSUSED */
5456 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5457 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5458 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5459 {
5460 struct {
5461 struct nameidata nd;
5462 #if CONFIG_FSE
5463 struct vnode_attr va;
5464 fse_info finfo;
5465 #endif
5466 } *__unlink_data;
5467 struct nameidata *ndp;
5468 vnode_t vp, dvp;
5469 int error;
5470 struct componentname *cnp;
5471 char *path = NULL;
5472 char *no_firmlink_path = NULL;
5473 int len_path = 0;
5474 int len_no_firmlink_path = 0;
5475 int flags;
5476 int need_event;
5477 int has_listeners;
5478 int truncated_path;
5479 int truncated_no_firmlink_path;
5480 int batched;
5481 struct vnode_attr *vap;
5482 int do_retry;
5483 int retry_count = 0;
5484 int cn_flags;
5485
5486 cn_flags = LOCKPARENT;
5487 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
5488 cn_flags |= AUDITVNPATH1;
5489 }
5490 /* If a starting dvp is passed, it trumps any fd passed. */
5491 if (start_dvp) {
5492 cn_flags |= USEDVP;
5493 }
5494
5495 #if NAMEDRSRCFORK
5496 /* unlink or delete is allowed on rsrc forks and named streams */
5497 cn_flags |= CN_ALLOWRSRCFORK;
5498 #endif
5499
5500 __unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
5501 ndp = &__unlink_data->nd;
5502 #if CONFIG_FSE
5503 fse_info *finfop = &__unlink_data->finfo;
5504 #endif
5505
5506 retry:
5507 do_retry = 0;
5508 flags = 0;
5509 need_event = 0;
5510 has_listeners = 0;
5511 truncated_path = 0;
5512 truncated_no_firmlink_path = 0;
5513 vap = NULL;
5514
5515 NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
5516
5517 ndp->ni_dvp = start_dvp;
5518 ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
5519 cnp = &ndp->ni_cnd;
5520
5521 continue_lookup:
5522 error = nameiat(ndp, fd);
5523 if (error) {
5524 goto early_out;
5525 }
5526
5527 dvp = ndp->ni_dvp;
5528 vp = ndp->ni_vp;
5529
5530 /* With Carbon delete semantics, busy files cannot be deleted */
5531 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
5532 flags |= VNODE_REMOVE_NODELETEBUSY;
5533 }
5534
5535 /* Skip any potential upcalls if told to. */
5536 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
5537 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
5538 }
5539
5540 if (vp) {
5541 batched = vnode_compound_remove_available(vp);
5542 /*
5543 * The root of a mounted filesystem cannot be deleted.
5544 */
5545 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
5546 error = EBUSY;
5547 goto out;
5548 }
5549
5550 #if DEVELOPMENT || DEBUG
5551 /*
5552 * XXX VSWAP: Check for entitlements or special flag here
5553 * so we can restrict access appropriately.
5554 */
5555 #else /* DEVELOPMENT || DEBUG */
5556
5557 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
5558 error = EPERM;
5559 goto out;
5560 }
5561 #endif /* DEVELOPMENT || DEBUG */
5562
5563 if (!batched) {
5564 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
5565 if (error) {
5566 if (error == ENOENT) {
5567 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5568 do_retry = 1;
5569 retry_count++;
5570 }
5571 }
5572 goto out;
5573 }
5574 }
5575 } else {
5576 batched = 1;
5577
5578 if (!vnode_compound_remove_available(dvp)) {
5579 panic("No vp, but no compound remove?");
5580 }
5581 }
5582
5583 #if CONFIG_FSE
5584 need_event = need_fsevent(FSE_DELETE, dvp);
5585 if (need_event) {
5586 if (!batched) {
5587 if ((vp->v_flag & VISHARDLINK) == 0) {
5588 /* XXX need to get these data in batched VNOP */
5589 get_fse_info(vp, finfop, ctx);
5590 }
5591 } else {
5592 error =
5593 vfs_get_notify_attributes(&__unlink_data->va);
5594 if (error) {
5595 goto out;
5596 }
5597
5598 vap = &__unlink_data->va;
5599 }
5600 }
5601 #endif
5602 has_listeners = kauth_authorize_fileop_has_listeners();
5603 if (need_event || has_listeners) {
5604 if (path == NULL) {
5605 GET_PATH(path);
5606 }
5607 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
5608 if (no_firmlink_path == NULL) {
5609 GET_PATH(no_firmlink_path);
5610 }
5611 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5612 }
5613
5614 #if NAMEDRSRCFORK
5615 if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
5616 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
5617 } else
5618 #endif
5619 {
5620 error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
5621 vp = ndp->ni_vp;
5622 if (error == EKEEPLOOKING) {
5623 if (!batched) {
5624 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
5625 }
5626
5627 if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
5628 panic("EKEEPLOOKING, but continue flag not set?");
5629 }
5630
5631 if (vnode_isdir(vp)) {
5632 error = EISDIR;
5633 goto out;
5634 }
5635 goto continue_lookup;
5636 } else if (error == ENOENT && batched) {
5637 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5638 /*
5639 * For compound VNOPs, the authorization callback may
5640 * return ENOENT in case of racing hardlink lookups
5641 * hitting the name cache, redrive the lookup.
5642 */
5643 do_retry = 1;
5644 retry_count += 1;
5645 goto out;
5646 }
5647 }
5648 }
5649
5650 /*
5651 * Call out to allow 3rd party notification of delete.
5652 * Ignore result of kauth_authorize_fileop call.
5653 */
5654 if (!error) {
5655 if (has_listeners) {
5656 kauth_authorize_fileop(vfs_context_ucred(ctx),
5657 KAUTH_FILEOP_DELETE,
5658 (uintptr_t)vp,
5659 (uintptr_t)path);
5660 }
5661
5662 if (vp->v_flag & VISHARDLINK) {
5663 //
5664 // if a hardlink gets deleted we want to blow away the
5665 // v_parent link because the path that got us to this
5666 // instance of the link is no longer valid. this will
5667 // force the next call to get the path to ask the file
5668 // system instead of just following the v_parent link.
5669 //
5670 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
5671 }
5672
5673 #if CONFIG_FSE
5674 if (need_event) {
5675 if (vp->v_flag & VISHARDLINK) {
5676 get_fse_info(vp, finfop, ctx);
5677 } else if (vap) {
5678 vnode_get_fse_info_from_vap(vp, finfop, vap);
5679 }
5680 if (truncated_path) {
5681 finfop->mode |= FSE_TRUNCATED_PATH;
5682 }
5683 add_fsevent(FSE_DELETE, ctx,
5684 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5685 FSE_ARG_FINFO, finfop,
5686 FSE_ARG_DONE);
5687 }
5688 #endif
5689 }
5690
5691 out:
5692 if (path != NULL) {
5693 RELEASE_PATH(path);
5694 path = NULL;
5695 }
5696
5697 if (no_firmlink_path != NULL) {
5698 RELEASE_PATH(no_firmlink_path);
5699 no_firmlink_path = NULL;
5700 }
5701 #if NAMEDRSRCFORK
5702 /* recycle the deleted rsrc fork vnode to force a reclaim, which
5703 * will cause its shadow file to go away if necessary.
5704 */
5705 if (vp && (vnode_isnamedstream(vp)) &&
5706 (vp->v_parent != NULLVP) &&
5707 vnode_isshadow(vp)) {
5708 vnode_recycle(vp);
5709 }
5710 #endif
5711 /*
5712 * nameidone has to happen before we vnode_put(dvp)
5713 * since it may need to release the fs_nodelock on the dvp
5714 */
5715 nameidone(ndp);
5716 vnode_put(dvp);
5717 if (vp) {
5718 vnode_put(vp);
5719 }
5720
5721 if (do_retry) {
5722 goto retry;
5723 }
5724
5725 early_out:
5726 kfree_type(typeof(*__unlink_data), __unlink_data);
5727 return error;
5728 }
5729
5730 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)5731 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5732 enum uio_seg segflg, int unlink_flags)
5733 {
5734 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5735 unlink_flags);
5736 }
5737
5738 /*
5739 * Delete a name from the filesystem using Carbon semantics.
5740 */
5741 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)5742 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5743 {
5744 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5745 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5746 }
5747
5748 /*
5749 * Delete a name from the filesystem using POSIX semantics.
5750 */
5751 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)5752 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5753 {
5754 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5755 uap->path, UIO_USERSPACE, 0);
5756 }
5757
5758 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)5759 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5760 {
5761 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5762 return EINVAL;
5763 }
5764
5765 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5766 int unlink_flags = 0;
5767
5768 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5769 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5770 }
5771 return rmdirat_internal(vfs_context_current(), uap->fd,
5772 uap->path, UIO_USERSPACE, unlink_flags);
5773 } else {
5774 return unlinkat_internal(vfs_context_current(), uap->fd,
5775 NULLVP, uap->path, UIO_USERSPACE, 0);
5776 }
5777 }
5778
5779 /*
5780 * Reposition read/write file offset.
5781 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* Resolve fd to its fileproc and vnode. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			/* fd refers to a non-vnode object; report ESPIPE. */
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		/* FIFOs are not seekable. */
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * An lseek(fd, 0, SEEK_CUR) merely reads the current offset;
	 * everything else may change it, so use the stricter MAC check.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	/* Take an iocount on the vnode for the duration of the operation. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:
		/* SEEK_CUR: relative to the current file offset. */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* SEEK_END: relative to the current file size. */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		/* SEEK_SET: absolute offset, nothing to adjust. */
		break;
	case SEEK_HOLE:
		/* Let the filesystem find the next hole at/after 'offset'. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		/* Let the filesystem find the next data region at/after 'offset'. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
5873
5874
5875 /*
5876 * Check access permissions.
5877 *
5878 * Returns: 0 Success
5879 * vnode_authorize:???
5880 */
5881 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)5882 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5883 {
5884 kauth_action_t action;
5885 int error;
5886
5887 /*
5888 * If just the regular access bits, convert them to something
5889 * that vnode_authorize will understand.
5890 */
5891 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5892 action = 0;
5893 if (uflags & R_OK) {
5894 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5895 }
5896 if (uflags & W_OK) {
5897 if (vnode_isdir(vp)) {
5898 action |= KAUTH_VNODE_ADD_FILE |
5899 KAUTH_VNODE_ADD_SUBDIRECTORY;
5900 /* might want delete rights here too */
5901 } else {
5902 action |= KAUTH_VNODE_WRITE_DATA;
5903 }
5904 }
5905 if (uflags & X_OK) {
5906 if (vnode_isdir(vp)) {
5907 action |= KAUTH_VNODE_SEARCH;
5908 } else {
5909 action |= KAUTH_VNODE_EXECUTE;
5910 }
5911 }
5912 } else {
5913 /* take advantage of definition of uflags */
5914 action = uflags >> 8;
5915 }
5916
5917 #if CONFIG_MACF
5918 error = mac_vnode_check_access(ctx, vp, uflags);
5919 if (error) {
5920 return error;
5921 }
5922 #endif /* MAC */
5923
5924 /* action == 0 means only check for existence */
5925 if (action != 0) {
5926 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5927 } else {
5928 error = 0;
5929 }
5930
5931 return error;
5932 }
5933
5934
5935
5936 /*
5937 * access_extended: Check access permissions in bulk.
5938 *
5939 * Description: uap->entries Pointer to an array of accessx
5940 * descriptor structs, plus one or
5941 * more NULL terminated strings (see
5942 * "Notes" section below).
5943 * uap->size Size of the area pointed to by
5944 * uap->entries.
5945 * uap->results Pointer to the results array.
5946 *
5947 * Returns: 0 Success
5948 * ENOMEM Insufficient memory
5949 * EINVAL Invalid arguments
5950 * namei:EFAULT Bad address
5951 * namei:ENAMETOOLONG Filename too long
5952 * namei:ENOENT No such file or directory
5953 * namei:ELOOP Too many levels of symbolic links
5954 * namei:EBADF Bad file descriptor
5955 * namei:ENOTDIR Not a directory
5956 * namei:???
5957 * access1:
5958 *
5959 * Implicit returns:
5960 * uap->results Array contents modified
5961 *
5962 * Notes: The uap->entries are structured as an arbitrary length array
5963 * of accessx descriptors, followed by one or more NULL terminated
5964 * strings
5965 *
5966 * struct accessx_descriptor[0]
5967 * ...
5968 * struct accessx_descriptor[n]
5969 * char name_data[0];
5970 *
5971 * We determine the entry count by walking the buffer containing
5972 * the uap->entries argument descriptor. For each descriptor we
5973 * see, the valid values for the offset ad_name_offset will be
5974 * in the byte range:
5975 *
5976 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5977 * to
5978 * [ uap->entries + uap->size - 2 ]
5979 *
5980 * since we must have at least one string, and the string must
5981 * be at least one character plus the NULL terminator in length.
5982 *
5983 * XXX: Need to support the check-as uid argument
5984 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL credential here lets the 'out' path tell copy vs. no-copy apart. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; larger ones are heap-allocated. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  "Soft" errors become the per-entry
		 * result; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6226
6227
6228 /*
6229 * Returns: 0 Success
6230 * namei:EFAULT Bad address
6231 * namei:ENAMETOOLONG Filename too long
6232 * namei:ENOENT No such file or directory
6233 * namei:ELOOP Too many levels of symbolic links
6234 * namei:EBADF Bad file descriptor
6235 * namei:ENOTDIR Not a directory
6236 * namei:???
6237 * access1:
6238 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 *
	 * kauth_cred_copy_real() takes a reference that the 'out' path drops;
	 * in the AT_EACCESS case we borrow the caller's credential and must
	 * NOT drop it.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* Perform the actual permission check. */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* ni_dvp was only taken when WANTPARENT was set above. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6317
6318 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6319 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6320 {
6321 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6322 uap->path, uap->flags, 0, UIO_USERSPACE);
6323 }
6324
6325 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6326 faccessat(__unused proc_t p, struct faccessat_args *uap,
6327 __unused int32_t *retval)
6328 {
6329 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6330 return EINVAL;
6331 }
6332
6333 return faccessat_internal(vfs_context_current(), uap->fd,
6334 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6335 }
6336
6337 /*
6338 * Returns: 0 Success
6339 * EFAULT
6340 * copyout:EFAULT
6341 * namei:???
6342 * vn_stat:???
6343 */
6344 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6345 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6346 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6347 enum uio_seg segflg, int fd, int flag)
6348 {
6349 struct nameidata nd;
6350 int follow;
6351 union {
6352 struct stat sb;
6353 struct stat64 sb64;
6354 } source = {};
6355 union {
6356 struct user64_stat user64_sb;
6357 struct user32_stat user32_sb;
6358 struct user64_stat64 user64_sb64;
6359 struct user32_stat64 user32_sb64;
6360 } dest = {};
6361 caddr_t sbp;
6362 int error, my_size;
6363 kauth_filesec_t fsec;
6364 size_t xsecurity_bufsize;
6365 void * statptr;
6366 struct fileproc *fp = NULL;
6367 int needsrealdev = 0;
6368
6369 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6370 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6371 segflg, path, ctx);
6372
6373 #if NAMEDRSRCFORK
6374 int is_namedstream = 0;
6375 /* stat calls are allowed for resource forks. */
6376 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6377 #endif
6378
6379 if (flag & AT_FDONLY) {
6380 vnode_t fvp;
6381
6382 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6383 if (error) {
6384 return error;
6385 }
6386 if ((error = vnode_getwithref(fvp))) {
6387 file_drop(fd);
6388 return error;
6389 }
6390 nd.ni_vp = fvp;
6391 } else {
6392 error = nameiat(&nd, fd);
6393 if (error) {
6394 return error;
6395 }
6396 }
6397 fsec = KAUTH_FILESEC_NONE;
6398
6399 statptr = (void *)&source;
6400
6401 #if NAMEDRSRCFORK
6402 /* Grab reference on the shadow stream file vnode to
6403 * force an inactive on release which will mark it
6404 * for recycle.
6405 */
6406 if (vnode_isnamedstream(nd.ni_vp) &&
6407 (nd.ni_vp->v_parent != NULLVP) &&
6408 vnode_isshadow(nd.ni_vp)) {
6409 is_namedstream = 1;
6410 vnode_ref(nd.ni_vp);
6411 }
6412 #endif
6413
6414 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6415 if (fp && (xsecurity == USER_ADDR_NULL)) {
6416 /*
6417 * If the caller has the file open, and is not
6418 * requesting extended security information, we are
6419 * going to let them get the basic stat information.
6420 */
6421 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6422 fp->fp_glob->fg_cred);
6423 } else {
6424 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6425 isstat64, needsrealdev, ctx);
6426 }
6427
6428 #if NAMEDRSRCFORK
6429 if (is_namedstream) {
6430 vnode_rele(nd.ni_vp);
6431 }
6432 #endif
6433 vnode_put(nd.ni_vp);
6434 nameidone(&nd);
6435 if (fp) {
6436 file_drop(fd);
6437 fp = NULL;
6438 }
6439
6440 if (error) {
6441 return error;
6442 }
6443 /* Zap spare fields */
6444 if (isstat64 != 0) {
6445 source.sb64.st_lspare = 0;
6446 source.sb64.st_qspare[0] = 0LL;
6447 source.sb64.st_qspare[1] = 0LL;
6448 if (vfs_context_is64bit(ctx)) {
6449 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6450 my_size = sizeof(dest.user64_sb64);
6451 sbp = (caddr_t)&dest.user64_sb64;
6452 } else {
6453 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6454 my_size = sizeof(dest.user32_sb64);
6455 sbp = (caddr_t)&dest.user32_sb64;
6456 }
6457 /*
6458 * Check if we raced (post lookup) against the last unlink of a file.
6459 */
6460 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6461 source.sb64.st_nlink = 1;
6462 }
6463 } else {
6464 source.sb.st_lspare = 0;
6465 source.sb.st_qspare[0] = 0LL;
6466 source.sb.st_qspare[1] = 0LL;
6467 if (vfs_context_is64bit(ctx)) {
6468 munge_user64_stat(&source.sb, &dest.user64_sb);
6469 my_size = sizeof(dest.user64_sb);
6470 sbp = (caddr_t)&dest.user64_sb;
6471 } else {
6472 munge_user32_stat(&source.sb, &dest.user32_sb);
6473 my_size = sizeof(dest.user32_sb);
6474 sbp = (caddr_t)&dest.user32_sb;
6475 }
6476
6477 /*
6478 * Check if we raced (post lookup) against the last unlink of a file.
6479 */
6480 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6481 source.sb.st_nlink = 1;
6482 }
6483 }
6484 if ((error = copyout(sbp, ub, my_size)) != 0) {
6485 goto out;
6486 }
6487
6488 /* caller wants extended security information? */
6489 if (xsecurity != USER_ADDR_NULL) {
6490 /* did we get any? */
6491 if (fsec == KAUTH_FILESEC_NONE) {
6492 if (susize(xsecurity_size, 0) != 0) {
6493 error = EFAULT;
6494 goto out;
6495 }
6496 } else {
6497 /* find the user buffer size */
6498 xsecurity_bufsize = fusize(xsecurity_size);
6499
6500 /* copy out the actual data size */
6501 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6502 error = EFAULT;
6503 goto out;
6504 }
6505
6506 /* if the caller supplied enough room, copy out to it */
6507 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6508 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6509 }
6510 }
6511 }
6512 out:
6513 if (fsec != KAUTH_FILESEC_NONE) {
6514 kauth_filesec_free(fsec);
6515 }
6516 return error;
6517 }
6518
6519 /*
6520 * stat_extended: Get file status; with extended security (ACL).
6521 *
6522 * Parameters: p (ignored)
6523 * uap User argument descriptor (see below)
6524 * retval (ignored)
6525 *
6526 * Indirect: uap->path Path of file to get status from
6527 * uap->ub User buffer (holds file status info)
6528 * uap->xsecurity ACL to get (extended security)
6529 * uap->xsecurity_size Size of ACL
6530 *
6531 * Returns: 0 Success
6532 * !0 errno value
6533 *
6534 */
6535 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)6536 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6537 __unused int32_t *retval)
6538 {
6539 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6540 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6541 0);
6542 }
6543
6544 /*
6545 * Returns: 0 Success
6546 * fstatat_internal:??? [see fstatat_internal() in this file]
6547 */
6548 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)6549 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6550 {
6551 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6552 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6553 }
6554
6555 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)6556 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6557 {
6558 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6559 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6560 }
6561
6562 /*
6563 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6564 *
6565 * Parameters: p (ignored)
6566 * uap User argument descriptor (see below)
6567 * retval (ignored)
6568 *
6569 * Indirect: uap->path Path of file to get status from
6570 * uap->ub User buffer (holds file status info)
6571 * uap->xsecurity ACL to get (extended security)
6572 * uap->xsecurity_size Size of ACL
6573 *
6574 * Returns: 0 Success
6575 * !0 errno value
6576 *
6577 */
6578 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)6579 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6580 {
6581 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6582 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6583 0);
6584 }
6585
6586 /*
6587 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6588 *
6589 * Parameters: p (ignored)
6590 * uap User argument descriptor (see below)
6591 * retval (ignored)
6592 *
6593 * Indirect: uap->path Path of file to get status from
6594 * uap->ub User buffer (holds file status info)
6595 * uap->xsecurity ACL to get (extended security)
6596 * uap->xsecurity_size Size of ACL
6597 *
6598 * Returns: 0 Success
6599 * !0 errno value
6600 *
6601 */
6602 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)6603 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6604 {
6605 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6606 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6607 AT_SYMLINK_NOFOLLOW);
6608 }
6609
6610 /*
6611 * Get file status; this version does not follow links.
6612 */
6613 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)6614 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6615 {
6616 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6617 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6618 }
6619
6620 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)6621 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6622 {
6623 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6624 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6625 }
6626
6627 /*
6628 * lstat64_extended: Get file status; can handle large inode numbers; does not
6629 * follow links; with extended security (ACL).
6630 *
6631 * Parameters: p (ignored)
6632 * uap User argument descriptor (see below)
6633 * retval (ignored)
6634 *
6635 * Indirect: uap->path Path of file to get status from
6636 * uap->ub User buffer (holds file status info)
6637 * uap->xsecurity ACL to get (extended security)
6638 * uap->xsecurity_size Size of ACL
6639 *
6640 * Returns: 0 Success
6641 * !0 errno value
6642 *
6643 */
6644 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)6645 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6646 {
6647 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6648 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6649 AT_SYMLINK_NOFOLLOW);
6650 }
6651
6652 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)6653 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6654 {
6655 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6656 return EINVAL;
6657 }
6658
6659 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6660 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6661 }
6662
6663 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)6664 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6665 __unused int32_t *retval)
6666 {
6667 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6668 return EINVAL;
6669 }
6670
6671 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6672 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6673 }
6674
6675 /*
6676 * Get configurable pathname variables.
6677 *
6678 * Returns: 0 Success
6679 * namei:???
6680 * vn_pathconf:???
6681 *
6682 * Notes: Global implementation constants are intended to be
6683 * implemented in this function directly; all other constants
6684 * are per-FS implementation, and therefore must be handled in
6685 * each respective FS, instead.
6686 *
6687 * XXX We implement some things globally right now that should actually be
6688 * XXX per-FS; we will need to deal with this at some point.
6689 */
6690 /* ARGSUSED */
6691 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)6692 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6693 {
6694 int error;
6695 struct nameidata nd;
6696 vfs_context_t ctx = vfs_context_current();
6697
6698 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6699 UIO_USERSPACE, uap->path, ctx);
6700 error = namei(&nd);
6701 if (error) {
6702 return error;
6703 }
6704
6705 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6706
6707 vnode_put(nd.ni_vp);
6708 nameidone(&nd);
6709 return error;
6710 }
6711
/*
 * Return target name of a symbolic link.
 *
 * Common implementation for readlink(2)/readlinkat(2): look up `path'
 * relative to `fd' without following the final component, verify the
 * result is a symlink, then copy its target into `buf'.  On return,
 * *retval holds the number of bytes copied out; no NUL terminator is
 * appended here, and the result may be truncated to `bufsize'.
 */
/* ARGSUSED */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* *retval is an int; reject sizes that could not be represented */
	if (bufsize > INT32_MAX) {
		return EINVAL;
	}

	/* NOFOLLOW so the lookup lands on the link itself, not its target */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Build a single-iovec uio over the caller's buffer, on the stack. */
	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		/* without MACF, error is still 0 here (nameiat succeeded) */
		if (error == 0) {
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		}
		if (error == 0) {
			error = VNOP_READLINK(vp, auio, ctx);
		}
	}
	vnode_put(vp);

	/* Bytes transferred = requested size minus what the uio has left. */
	*retval = (int)(bufsize - uio_resid(auio));
	return error;
}
6764
6765 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)6766 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6767 {
6768 enum uio_seg procseg;
6769
6770 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6771 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6772 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6773 uap->count, procseg, retval);
6774 }
6775
6776 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)6777 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6778 {
6779 enum uio_seg procseg;
6780
6781 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6782 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6783 procseg, uap->buf, uap->bufsize, procseg, retval);
6784 }
6785
/*
 * Change file flags, the deep inner layer.
 *
 * Authorizes the flag change described by `va' (va_flags must already be
 * set by the caller) and then applies it through the supplied `setattr'
 * callback with `arg'.  The callback indirection lets callers plug in
 * different setattr paths (see chflags1()).
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* notify MAC policies only if the change actually succeeded */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
6827
/*
 * Change file flags.
 *
 * Builds a vnode_attr with va_flags = `flags' and applies it via
 * chflags0(), using vnode_setattr() as the setattr callback.
 *
 * NOTE: this will vnode_put() `vp'
 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* &va doubles as the opaque callback argument for vnode_setattr() */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	vnode_put(vp);

	/* the filesystem must actually have honored the flags change */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
6851
/*
 * Change flags of a file given a path name.
 */
/* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	AUDIT_ARG(fflags, uap->flags);
	/* resolve the path, following a trailing symlink */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
6879
/*
 * Change flags of a file given a file descriptor.
 */
/* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	/* translate the descriptor to its backing vnode */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* take an iocount so the vnode cannot be reclaimed under us */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
6909
/*
 * Change security information on a filesystem object.
 *
 * Applies whatever of va_mode / va_uid / va_gid / va_acl the caller has
 * activated in `vap' to `vp', running the corresponding MAC checks first
 * and the MAC notifications after a successful vnode_setattr().
 *
 * Returns:	0			Success
 *		EPERM			Operation not permitted
 *		vnode_authattr:???	[anything vnode_authattr can return]
 *		vnode_authorize:???	[anything vnode_authorize can return]
 *		vnode_setattr:???	[anything vnode_setattr can return]
 *
 * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
 *		translated to EPERM before being returned.
 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* one MAC check per class of attribute being set */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC policies of the attributes that were actually changed */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
6989
6990
/*
 * Change mode of a file given a path name.
 *
 * Looks up `path' relative to `fd' (AT_SYMLINK_NOFOLLOW in `flag'
 * selects whether a trailing symlink is chased) and applies `vap'
 * through chmod_vnode().
 *
 * Returns:	0			Success
 *		namei:???		[anything namei can return]
 *		chmod_vnode:???		[anything chmod_vnode can return]
 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;

	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
	    segflg, path, ctx);
	if ((error = nameiat(&nd, fd))) {
		return error;
	}
	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7016
/*
 * chmod_extended: Change the mode of a file given a path name; with extended
 * argument list (including extended security (ACL)).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			(ignored)
 *
 * Indirect:	uap->path		Path to object (same as 'chmod')
 *		uap->uid		UID to set
 *		uap->gid		GID to set
 *		uap->mode		File mode to set (same as 'chmod')
 *		uap->xsecurity		ACL to set (or delete)
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:		We should enumerate the possible errno values here, and where
 *		in the code they originated.
 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* only activate the attributes the caller actually supplied */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/* copy the caller-supplied filesec in; freed below */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7085
7086 /*
7087 * Returns: 0 Success
7088 * chmodat:??? [anything chmodat can return]
7089 */
7090 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7091 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7092 int flag, enum uio_seg segflg)
7093 {
7094 struct vnode_attr va;
7095
7096 VATTR_INIT(&va);
7097 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7098
7099 return chmodat(ctx, path, &va, fd, flag, segflg);
7100 }
7101
7102 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7103 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7104 {
7105 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7106 AT_FDCWD, 0, UIO_USERSPACE);
7107 }
7108
7109 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7110 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7111 {
7112 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7113 return EINVAL;
7114 }
7115
7116 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7117 uap->fd, uap->flag, UIO_USERSPACE);
7118 }
7119
/*
 * Change mode of a file given a file descriptor.
 *
 * Common helper for fchmod()/fchmod_extended(): resolves `fd' to its
 * backing vnode, takes an iocount, and applies `vap' via chmod_vnode().
 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* iocount keeps the vnode from being reclaimed while we work on it */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7146
/*
 * fchmod_extended: Change mode of a file given a file descriptor; with
 * extended argument list (including extended security (ACL)).
 *
 * Parameters:	p			Process requesting to change file mode
 *		uap			User argument descriptor (see below)
 *		retval			(ignored)
 *
 * Indirect:	uap->mode		File mode to set (same as 'chmod')
 *		uap->uid		UID to set
 *		uap->gid		GID to set
 *		uap->xsecurity		ACL to set (or delete)
 *		uap->fd			File descriptor of file to change mode
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* only activate the attributes the caller actually supplied */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/*
	 * NOTE(review): unlike chmod_extended() above, a NULL xsecurity is
	 * treated here as an ACL-removal request; "not being set" is the
	 * (user_addr_t)-1 sentinel instead.
	 */
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		/* copy the caller-supplied filesec in; freed below */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	error = fchmod1(p, uap->fd, &va);


	/* free the copied-in filesec, if one was allocated above */
	switch (uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL) {
			kauth_filesec_free(xsecdst);
		}
	}
	return error;
}
7218
7219 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7220 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7221 {
7222 struct vnode_attr va;
7223
7224 VATTR_INIT(&va);
7225 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7226
7227 return fchmod1(p, uap->fd, &va);
7228 }
7229
7230
/*
 * Set ownership given a path name.
 *
 * Looks up `path' relative to `fd' (a trailing symlink is followed
 * unless AT_SYMLINK_NOFOLLOW is set in `flag'), then applies whichever
 * of uid/gid is not VNOVAL.  EACCES from the authorization layer is
 * translated to EPERM before returning.
 */
/* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL means "leave this id unchanged" */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	vnode_put(vp);
	return error;
}
7301
7302 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7303 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7304 {
7305 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7306 uap->uid, uap->gid, 0, UIO_USERSPACE);
7307 }
7308
7309 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7310 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7311 {
7312 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7313 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7314 }
7315
7316 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7317 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7318 {
7319 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7320 return EINVAL;
7321 }
7322
7323 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7324 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7325 }
7326
/*
 * Set ownership given a file descriptor.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* take an iocount so the vnode stays valid while we work on it */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged" */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures report EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7399
/*
 * Fetch the access/modify times for the utimes() family.
 *
 * If `usrtvp' is NULL, the current time is used for both timestamps.
 * Otherwise two struct timevals, sized for the calling process's ABI
 * (32- or 64-bit), are copied in and converted to timespecs in
 * tsp[0] (access) and tsp[1] (modify).
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
7432
/*
 * Apply access (ts[0]) and modify (ts[1]) times to `vp'.
 *
 * `nullflag' is non-zero when the caller passed a NULL times pointer;
 * in that case VA_UTIMES_NULL is set for the lower layers and the
 * EACCES-to-EPERM translation below is suppressed.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit times: report permission failures as EPERM */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7489
/*
 * Set the access and modification times of a file.
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

	/* the last argument tells setutimes no explicit times were given */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	vnode_put(nd.ni_vp);
	return error;
}
7530
/*
 * Set the access and modification times of a file.
 */
/* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* fetch (or synthesize) the timestamps before touching the fd */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	/* iocount keeps the vnode valid while we apply the change */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7561
/*
 * Truncate a file given its path name.
 *
 * Fails with EINVAL for negative lengths; exceeding the process's
 * RLIMIT_FSIZE raises SIGXFSZ and returns EFBIG, mirroring write(2).
 */
/* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;
	rlim_t fsize_limit;

	if (uap->length < 0) {
		return EINVAL;
	}

	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	/* resolve the path, following a trailing symlink */
	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* truncation is expressed as a data-size attribute change */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7624
/*
 * Truncate a file given a file descriptor.
 *
 * Supports vnodes and POSIX shared memory objects; any other descriptor
 * type is EINVAL.  The descriptor must be open for writing.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	rlim_t fsize_limit;

	AUDIT_ARG(fd, uap->fd);
	if (uap->length < 0) {
		return EINVAL;
	}

	/* growing past RLIMIT_FSIZE signals SIGXFSZ, as write(2) would */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)uap->length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory has its own truncate path */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* the descriptor itself must have been opened for writing */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->fp_glob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	/* truncation is expressed as a data-size attribute change */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7703
7704
/*
 * Sync an open file with synchronized I/O _file_ integrity completion
 */
/* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync(2) is a pthread cancellation point */
	__pthread_testcancel(1);
	return fsync_common(p, uap, MNT_WAIT);
}
7715
7716
/*
 * Sync an open file with synchronized I/O _file_ integrity completion
 *
 * Notes:	This is a legacy support function that does not test for
 *		thread cancellation points.
 */
/* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* argument structures are layout-identical; see fsync_common() */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
7729
7730
/*
 * Sync an open file with synchronized I/O _data_ integrity completion
 */
/* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync(2) is a pthread cancellation point */
	__pthread_testcancel(1);
	/* MNT_DWAIT requests data integrity only; see fsync_common() */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
7741
7742
/*
 * fsync_common
 *
 * Common fsync code to support both synchronized I/O file integrity completion
 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
 *
 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
 * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
 * includes additional metadata unnecessary for retrieving the file data
 * contents, such as atime, mtime, ctime, etc., also be committed to stable
 * storage.
 *
 * Parameters:	p			The process
 *		uap->fd			The descriptor to synchronize
 *		flags			The data integrity flags
 *
 * Returns:	int			Success
 *	fp_getfvp:EBADF			Bad file descriptor
 *	fp_getfvp:ENOTSUP		fd does not refer to a vnode
 *	VNOP_FSYNC:???			unspecified
 *
 * Notes:	We use struct fsync_args because it is a short name, and all
 *		caller argument structures are otherwise identical.
 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* iocount keeps the vnode valid for the duration of the sync */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7805
7806 /*
7807 * Duplicate files. Source must be a file, target must be a file or
7808 * must not exist.
7809 *
7810 * XXX Copyfile authorisation checking is woefully inadequate, and will not
7811 * perform inheritance correctly.
7812 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; on success we hold a reference on fromnd.ni_vp. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target with intent to create.  SAVESTART keeps
	 * tond.ni_startdir referenced so it can be released in 'out:' below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories cannot be copied from or overwritten by this call. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are unsupported unless backed by the fdesc filesystem. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need read on the source, delete on any existing target, add on the dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* The source cannot be the target's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is an internal "nothing to do" marker, mapped to success below. */
		error = -1;
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Translate the "same file, nothing to do" sentinel into success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
7915
7916 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7917
7918 /*
7919 * Helper function for doing clones. The caller is expected to provide an
7920 * iocounted source vnode and release it.
7921 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;		/* attributes read from the source */
	struct vnode_attr nva;		/* attributes requested for the clone */
	uint32_t vnop_flags;

	/* Only regular files, symlinks, and ordinary directories may clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Refuse volume roots and mount points as clone sources. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Resolve the destination path relative to dst_dirfd. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* A clone never overwrites an existing target. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning is a single-filesystem operation. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* May we add an entry (file or subdirectory) to the target dir? */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * If the caller already proved data-read access (e.g. via an fd
	 * opened for reading), skip re-checking KAUTH_VNODE_READ_DATA.
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACL. The clone file
	 * will inherit the target directory's ACL.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/*
	 * NOTE(review): va_acl was not VATTR_WANTED above, so this branch can
	 * only fire if the filesystem returned an ACL unsolicited — confirm
	 * whether va_acl should also be requested here.
	 */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		/*
		 * NOTE(review): an error from vnode_authattr_new() is not
		 * checked before VNOP_CLONEFILE overwrites 'error' below —
		 * verify this is intentional.
		 */
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		/* vn_attribute_prepare succeeded; must be paired with cleanup. */
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		/* Keep the dir-inherited DATAVAULT/RESTRICTED bits, not the source's. */
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			/* Unexpected type for the clone: skip event posting. */
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8142
8143 /*
8144 * clone files or directories, target must not exist.
8145 */
8146 /* ARGSUSED */
8147 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8148 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8149 __unused int32_t *retval)
8150 {
8151 vnode_t fvp;
8152 struct nameidata fromnd;
8153 int follow;
8154 int error;
8155 vfs_context_t ctx = vfs_context_current();
8156
8157 /* Check that the flags are valid. */
8158 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8159 return EINVAL;
8160 }
8161
8162 AUDIT_ARG(fd, uap->src_dirfd);
8163
8164 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8165 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8166 UIO_USERSPACE, uap->src, ctx);
8167 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8168 return error;
8169 }
8170
8171 fvp = fromnd.ni_vp;
8172 nameidone(&fromnd);
8173
8174 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8175 uap->flags, ctx);
8176
8177 vnode_put(fvp);
8178 return error;
8179 }
8180
8181 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8182 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8183 __unused int32_t *retval)
8184 {
8185 vnode_t fvp;
8186 struct fileproc *fp;
8187 int error;
8188 vfs_context_t ctx = vfs_context_current();
8189
8190 /* Check that the flags are valid. */
8191 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8192 return EINVAL;
8193 }
8194
8195 AUDIT_ARG(fd, uap->src_fd);
8196 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8197 if (error) {
8198 return error;
8199 }
8200
8201 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8202 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8203 error = EBADF;
8204 goto out;
8205 }
8206
8207 if ((error = vnode_getwithref(fvp))) {
8208 goto out;
8209 }
8210
8211 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8212
8213 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8214 uap->flags, ctx);
8215
8216 vnode_put(fvp);
8217 out:
8218 file_drop(uap->src_fd);
8219 return error;
8220 }
8221
8222 static int
rename_submounts_callback(mount_t mp,void * arg)8223 rename_submounts_callback(mount_t mp, void *arg)
8224 {
8225 int error = 0;
8226 mount_t pmp = (mount_t)arg;
8227 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8228
8229 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8230 return 0;
8231 }
8232
8233 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8234 return 0;
8235 }
8236
8237 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8238 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8239 return -1;
8240 }
8241
8242 int pathlen = MAXPATHLEN;
8243 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8244 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8245 }
8246
8247 vfs_unbusy(mp);
8248
8249 return error;
8250 }
8251
8252 /*
8253 * Rename files. Source and destination must either both be directories,
8254 * or both not be directories. If target is a directory, it must be empty.
8255 */
8256 /* ARGSUSED */
8257 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,vfs_rename_flags_t flags)8258 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8259 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
8260 {
8261 if (flags & ~VFS_RENAME_FLAGS_MASK) {
8262 return EINVAL;
8263 }
8264
8265 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
8266 return EINVAL;
8267 }
8268
8269 vnode_t tvp, tdvp;
8270 vnode_t fvp, fdvp;
8271 vnode_t mnt_fvp;
8272 struct nameidata *fromnd, *tond;
8273 int error;
8274 int do_retry;
8275 int retry_count;
8276 int mntrename;
8277 int need_event;
8278 int need_kpath2;
8279 int has_listeners;
8280 const char *oname = NULL;
8281 char *from_name = NULL, *to_name = NULL;
8282 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8283 int from_len = 0, to_len = 0;
8284 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8285 int holding_mntlock;
8286 int vn_authorize_skipped;
8287 mount_t locked_mp = NULL;
8288 vnode_t oparent = NULLVP;
8289 #if CONFIG_FSE
8290 fse_info from_finfo, to_finfo;
8291 #endif
8292 int from_truncated = 0, to_truncated = 0;
8293 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8294 int batched = 0;
8295 struct vnode_attr *fvap, *tvap;
8296 int continuing = 0;
8297 /* carving out a chunk for structs that are too big to be on stack. */
8298 struct {
8299 struct nameidata from_node, to_node;
8300 struct vnode_attr fv_attr, tv_attr;
8301 } * __rename_data;
8302 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8303 fromnd = &__rename_data->from_node;
8304 tond = &__rename_data->to_node;
8305
8306 holding_mntlock = 0;
8307 do_retry = 0;
8308 retry_count = 0;
8309 retry:
8310 fvp = tvp = NULL;
8311 fdvp = tdvp = NULL;
8312 fvap = tvap = NULL;
8313 mnt_fvp = NULLVP;
8314 mntrename = FALSE;
8315 vn_authorize_skipped = FALSE;
8316
8317 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8318 segflg, from, ctx);
8319 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8320
8321 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8322 segflg, to, ctx);
8323 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8324
8325 continue_lookup:
8326 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8327 if ((error = nameiat(fromnd, fromfd))) {
8328 goto out1;
8329 }
8330 fdvp = fromnd->ni_dvp;
8331 fvp = fromnd->ni_vp;
8332
8333 if (fvp && fvp->v_type == VDIR) {
8334 tond->ni_cnd.cn_flags |= WILLBEDIR;
8335 }
8336 }
8337
8338 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8339 if ((error = nameiat(tond, tofd))) {
8340 /*
8341 * Translate error code for rename("dir1", "dir2/.").
8342 */
8343 if (error == EISDIR && fvp->v_type == VDIR) {
8344 error = EINVAL;
8345 }
8346 goto out1;
8347 }
8348 tdvp = tond->ni_dvp;
8349 tvp = tond->ni_vp;
8350 }
8351
8352 #if DEVELOPMENT || DEBUG
8353 /*
8354 * XXX VSWAP: Check for entitlements or special flag here
8355 * so we can restrict access appropriately.
8356 */
8357 #else /* DEVELOPMENT || DEBUG */
8358
8359 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8360 error = EPERM;
8361 goto out1;
8362 }
8363
8364 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8365 error = EPERM;
8366 goto out1;
8367 }
8368 #endif /* DEVELOPMENT || DEBUG */
8369
8370 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8371 error = ENOENT;
8372 goto out1;
8373 }
8374
8375 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8376 int32_t pval = 0;
8377 int err = 0;
8378
8379 /*
8380 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8381 * has the same name as target iff the following conditions are met:
8382 * 1. the target file system is case insensitive
8383 * 2. source and target directories are the same
8384 * 3. source and target files are the same
8385 * 4. name only differs in case (determined by underlying filesystem)
8386 */
8387 if (fvp != tvp || fdvp != tdvp) {
8388 error = EEXIST;
8389 goto out1;
8390 }
8391
8392 /*
8393 * Assume that the target file system is case sensitive if
8394 * _PC_CASE_SENSITIVE selector isn't supported.
8395 */
8396 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8397 if (err != 0 || pval != 0) {
8398 error = EEXIST;
8399 goto out1;
8400 }
8401 }
8402
8403 batched = vnode_compound_rename_available(fdvp);
8404
8405 #if CONFIG_FSE
8406 need_event = need_fsevent(FSE_RENAME, fdvp);
8407 if (need_event) {
8408 if (fvp) {
8409 get_fse_info(fvp, &from_finfo, ctx);
8410 } else {
8411 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8412 if (error) {
8413 goto out1;
8414 }
8415
8416 fvap = &__rename_data->fv_attr;
8417 }
8418
8419 if (tvp) {
8420 get_fse_info(tvp, &to_finfo, ctx);
8421 } else if (batched) {
8422 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8423 if (error) {
8424 goto out1;
8425 }
8426
8427 tvap = &__rename_data->tv_attr;
8428 }
8429 }
8430 #else
8431 need_event = 0;
8432 #endif /* CONFIG_FSE */
8433
8434 has_listeners = kauth_authorize_fileop_has_listeners();
8435
8436 need_kpath2 = 0;
8437 #if CONFIG_AUDIT
8438 if (AUDIT_RECORD_EXISTS()) {
8439 need_kpath2 = 1;
8440 }
8441 #endif
8442
8443 if (need_event || has_listeners) {
8444 if (from_name == NULL) {
8445 GET_PATH(from_name);
8446 }
8447
8448 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8449
8450 if (from_name_no_firmlink == NULL) {
8451 GET_PATH(from_name_no_firmlink);
8452 }
8453
8454 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8455 }
8456
8457 if (need_event || need_kpath2 || has_listeners) {
8458 if (to_name == NULL) {
8459 GET_PATH(to_name);
8460 }
8461
8462 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8463
8464 if (to_name_no_firmlink == NULL) {
8465 GET_PATH(to_name_no_firmlink);
8466 }
8467
8468 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8469 if (to_name && need_kpath2) {
8470 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8471 }
8472 }
8473 if (!fvp) {
8474 /*
8475 * Claim: this check will never reject a valid rename.
8476 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8477 * Suppose fdvp and tdvp are not on the same mount.
8478 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8479 * then you can't move it to within another dir on the same mountpoint.
8480 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8481 *
8482 * If this check passes, then we are safe to pass these vnodes to the same FS.
8483 */
8484 if (fdvp->v_mount != tdvp->v_mount) {
8485 error = EXDEV;
8486 goto out1;
8487 }
8488 goto skipped_lookup;
8489 }
8490
8491 /*
8492 * If the source and destination are the same (i.e. they're
8493 * links to the same vnode) and the target file system is
8494 * case sensitive, then there is nothing to do.
8495 *
8496 * XXX Come back to this.
8497 */
8498 if (fvp == tvp) {
8499 int pathconf_val;
8500
8501 /*
8502 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8503 * then assume that this file system is case sensitive.
8504 */
8505 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8506 pathconf_val != 0) {
8507 vn_authorize_skipped = TRUE;
8508 goto out1;
8509 }
8510 }
8511
8512 /*
8513 * Allow the renaming of mount points.
8514 * - target must not exist
8515 * - target must reside in the same directory as source
8516 * - union mounts cannot be renamed
8517 * - the root fs, and tightly-linked system volumes, cannot be renamed
8518 *
8519 * XXX Handle this in VFS after a continued lookup (if we missed
8520 * in the cache to start off)
8521 *
8522 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8523 * we'll skip past here. The file system is responsible for
8524 * checking that @tvp is not a descendent of @fvp and vice versa
8525 * so it should always return EINVAL if either @tvp or @fvp is the
8526 * root of a volume.
8527 */
8528 if ((fvp->v_flag & VROOT) &&
8529 (fvp->v_type == VDIR) &&
8530 (tvp == NULL) &&
8531 (fvp->v_mountedhere == NULL) &&
8532 (fdvp == tdvp) &&
8533 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8534 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8535 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8536 vnode_t coveredvp;
8537
8538 /* switch fvp to the covered vnode */
8539 coveredvp = fvp->v_mount->mnt_vnodecovered;
8540 if ((vnode_getwithref(coveredvp))) {
8541 error = ENOENT;
8542 goto out1;
8543 }
8544 /*
8545 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
8546 * later.
8547 */
8548 mnt_fvp = fvp;
8549
8550 fvp = coveredvp;
8551 mntrename = TRUE;
8552 }
8553 /*
8554 * Check for cross-device rename.
8555 */
8556 if ((fvp->v_mount != tdvp->v_mount) ||
8557 (tvp && (fvp->v_mount != tvp->v_mount))) {
8558 error = EXDEV;
8559 goto out1;
8560 }
8561
8562 /*
8563 * If source is the same as the destination (that is the
8564 * same inode number) then there is nothing to do...
8565 * EXCEPT if the underlying file system supports case
8566 * insensitivity and is case preserving. In this case
8567 * the file system needs to handle the special case of
8568 * getting the same vnode as target (fvp) and source (tvp).
8569 *
8570 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8571 * and _PC_CASE_PRESERVING can have this exception, and they need to
8572 * handle the special case of getting the same vnode as target and
8573 * source. NOTE: Then the target is unlocked going into vnop_rename,
8574 * so not to cause locking problems. There is a single reference on tvp.
8575 *
8576 * NOTE - that fvp == tvp also occurs if they are hard linked and
8577 * that correct behaviour then is just to return success without doing
8578 * anything.
8579 *
8580 * XXX filesystem should take care of this itself, perhaps...
8581 */
8582 if (fvp == tvp && fdvp == tdvp) {
8583 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8584 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8585 fromnd->ni_cnd.cn_namelen)) {
8586 vn_authorize_skipped = TRUE;
8587 goto out1;
8588 }
8589 }
8590
8591 if (holding_mntlock && fvp->v_mount != locked_mp) {
8592 /*
8593 * we're holding a reference and lock
8594 * on locked_mp, but it no longer matches
8595 * what we want to do... so drop our hold
8596 */
8597 mount_unlock_renames(locked_mp);
8598 mount_drop(locked_mp, 0);
8599 holding_mntlock = 0;
8600 }
8601 if (tdvp != fdvp && fvp->v_type == VDIR) {
8602 /*
8603 * serialize renames that re-shape
8604 * the tree... if holding_mntlock is
8605 * set, then we're ready to go...
8606 * otherwise we
8607 * first need to drop the iocounts
8608 * we picked up, second take the
8609 * lock to serialize the access,
8610 * then finally start the lookup
8611 * process over with the lock held
8612 */
8613 if (!holding_mntlock) {
8614 /*
8615 * need to grab a reference on
8616 * the mount point before we
8617 * drop all the iocounts... once
8618 * the iocounts are gone, the mount
8619 * could follow
8620 */
8621 locked_mp = fvp->v_mount;
8622 mount_ref(locked_mp, 0);
8623
8624 /*
8625 * nameidone has to happen before we vnode_put(tvp)
8626 * since it may need to release the fs_nodelock on the tvp
8627 */
8628 nameidone(tond);
8629
8630 if (tvp) {
8631 vnode_put(tvp);
8632 }
8633 vnode_put(tdvp);
8634
8635 /*
8636 * nameidone has to happen before we vnode_put(fdvp)
8637 * since it may need to release the fs_nodelock on the fvp
8638 */
8639 nameidone(fromnd);
8640
8641 vnode_put(fvp);
8642 vnode_put(fdvp);
8643
8644 if (mnt_fvp != NULLVP) {
8645 vnode_put(mnt_fvp);
8646 }
8647
8648 mount_lock_renames(locked_mp);
8649 holding_mntlock = 1;
8650
8651 goto retry;
8652 }
8653 } else {
8654 /*
8655 * when we dropped the iocounts to take
8656 * the lock, we allowed the identity of
8657 * the various vnodes to change... if they did,
8658 * we may no longer be dealing with a rename
8659 * that reshapes the tree... once we're holding
8660 * the iocounts, the vnodes can't change type
8661 * so we're free to drop the lock at this point
8662 * and continue on
8663 */
8664 if (holding_mntlock) {
8665 mount_unlock_renames(locked_mp);
8666 mount_drop(locked_mp, 0);
8667 holding_mntlock = 0;
8668 }
8669 }
8670
8671 if (!batched) {
8672 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
8673 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8674 flags, NULL);
8675 if (error) {
8676 if (error == ENOENT) {
8677 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8678 /*
8679 * We encountered a race where after doing the namei,
8680 * tvp stops being valid. If so, simply re-drive the rename
8681 * call from the top.
8682 */
8683 do_retry = 1;
8684 retry_count += 1;
8685 }
8686 }
8687 goto out1;
8688 }
8689 }
8690
8691 /* Release the 'mnt_fvp' now that it is no longer needed. */
8692 if (mnt_fvp != NULLVP) {
8693 vnode_put(mnt_fvp);
8694 mnt_fvp = NULLVP;
8695 }
8696
8697 // save these off so we can later verify that fvp is the same
8698 oname = fvp->v_name;
8699 oparent = fvp->v_parent;
8700
8701 skipped_lookup:
8702 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8703 tdvp, &tvp, &tond->ni_cnd, tvap,
8704 flags, ctx);
8705
8706 if (holding_mntlock) {
8707 /*
8708 * we can drop our serialization
8709 * lock now
8710 */
8711 mount_unlock_renames(locked_mp);
8712 mount_drop(locked_mp, 0);
8713 holding_mntlock = 0;
8714 }
8715 if (error) {
8716 if (error == EDATALESS) {
8717 /*
8718 * If we've been here before, something has gone
8719 * horribly wrong and we should just get out lest
8720 * we spiral around the drain forever.
8721 */
8722 if (flags & VFS_RENAME_DATALESS) {
8723 error = EIO;
8724 goto out1;
8725 }
8726
8727 /*
8728 * The object we're renaming is dataless (or has a
8729 * dataless descendent) and requires materialization
8730 * before the rename occurs. But we're holding the
8731 * mount point's rename lock, so it's not safe to
8732 * make the upcall.
8733 *
8734 * In this case, we release the lock, perform the
8735 * materialization, and start the whole thing over.
8736 */
8737 error = vnode_materialize_dataless_file(fvp,
8738 NAMESPACE_HANDLER_RENAME_OP);
8739
8740 if (error == 0) {
8741 /*
8742 * The next time around we need to tell the
8743 * file system that the materializtaion has
8744 * been performed.
8745 */
8746 flags |= VFS_RENAME_DATALESS;
8747 do_retry = 1;
8748 }
8749 goto out1;
8750 }
8751 if (error == EKEEPLOOKING) {
8752 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8753 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8754 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8755 }
8756 }
8757
8758 fromnd->ni_vp = fvp;
8759 tond->ni_vp = tvp;
8760
8761 goto continue_lookup;
8762 }
8763
8764 /*
8765 * We may encounter a race in the VNOP where the destination didn't
8766 * exist when we did the namei, but it does by the time we go and
8767 * try to create the entry. In this case, we should re-drive this rename
8768 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8769 * but other filesystems susceptible to this race could return it, too.
8770 */
8771 if (error == ERECYCLE) {
8772 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8773 do_retry = 1;
8774 retry_count += 1;
8775 } else {
8776 printf("rename retry limit due to ERECYCLE reached\n");
8777 error = ENOENT;
8778 }
8779 }
8780
8781 /*
8782 * For compound VNOPs, the authorization callback may return
8783 * ENOENT in case of racing hardlink lookups hitting the name
8784 * cache, redrive the lookup.
8785 */
8786 if (batched && error == ENOENT) {
8787 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8788 do_retry = 1;
8789 retry_count += 1;
8790 }
8791 }
8792
8793 goto out1;
8794 }
8795
8796 /* call out to allow 3rd party notification of rename.
8797 * Ignore result of kauth_authorize_fileop call.
8798 */
8799 kauth_authorize_fileop(vfs_context_ucred(ctx),
8800 KAUTH_FILEOP_RENAME,
8801 (uintptr_t)from_name, (uintptr_t)to_name);
8802 if (flags & VFS_RENAME_SWAP) {
8803 kauth_authorize_fileop(vfs_context_ucred(ctx),
8804 KAUTH_FILEOP_RENAME,
8805 (uintptr_t)to_name, (uintptr_t)from_name);
8806 }
8807
8808 #if CONFIG_FSE
8809 if (from_name != NULL && to_name != NULL) {
8810 if (from_truncated || to_truncated) {
8811 // set it here since only the from_finfo gets reported up to user space
8812 from_finfo.mode |= FSE_TRUNCATED_PATH;
8813 }
8814
8815 if (tvap && tvp) {
8816 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8817 }
8818 if (fvap) {
8819 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8820 }
8821
8822 if (tvp) {
8823 add_fsevent(FSE_RENAME, ctx,
8824 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8825 FSE_ARG_FINFO, &from_finfo,
8826 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8827 FSE_ARG_FINFO, &to_finfo,
8828 FSE_ARG_DONE);
8829 if (flags & VFS_RENAME_SWAP) {
8830 /*
8831 * Strictly speaking, swap is the equivalent of
8832 * *three* renames. FSEvents clients should only take
8833 * the events as a hint, so we only bother reporting
8834 * two.
8835 */
8836 add_fsevent(FSE_RENAME, ctx,
8837 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8838 FSE_ARG_FINFO, &to_finfo,
8839 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8840 FSE_ARG_FINFO, &from_finfo,
8841 FSE_ARG_DONE);
8842 }
8843 } else {
8844 add_fsevent(FSE_RENAME, ctx,
8845 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8846 FSE_ARG_FINFO, &from_finfo,
8847 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8848 FSE_ARG_DONE);
8849 }
8850 }
8851 #endif /* CONFIG_FSE */
8852
8853 /*
8854 * update filesystem's mount point data
8855 */
8856 if (mntrename) {
8857 char *cp, *pathend, *mpname;
8858 char * tobuf;
8859 struct mount *mp;
8860 int maxlen;
8861 size_t len = 0;
8862
8863 mp = fvp->v_mountedhere;
8864
8865 if (vfs_busy(mp, LK_NOWAIT)) {
8866 error = EBUSY;
8867 goto out1;
8868 }
8869 tobuf = zalloc(ZV_NAMEI);
8870
8871 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8872 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8873 } else {
8874 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8875 }
8876 if (!error) {
8877 /* find current mount point prefix */
8878 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8879 for (cp = pathend; *cp != '\0'; ++cp) {
8880 if (*cp == '/') {
8881 pathend = cp + 1;
8882 }
8883 }
8884 /* find last component of target name */
8885 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8886 if (*cp == '/') {
8887 mpname = cp + 1;
8888 }
8889 }
8890
8891 /* Update f_mntonname of sub mounts */
8892 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8893
8894 /* append name to prefix */
8895 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
8896 bzero(pathend, maxlen);
8897
8898 strlcpy(pathend, mpname, maxlen);
8899 }
8900 zfree(ZV_NAMEI, tobuf);
8901
8902 vfs_unbusy(mp);
8903
8904 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8905 }
8906 /*
8907 * fix up name & parent pointers. note that we first
8908 * check that fvp has the same name/parent pointers it
8909 * had before the rename call... this is a 'weak' check
8910 * at best...
8911 *
8912 * XXX oparent and oname may not be set in the compound vnop case
8913 */
8914 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8915 int update_flags;
8916
8917 update_flags = VNODE_UPDATE_NAME;
8918
8919 if (fdvp != tdvp) {
8920 update_flags |= VNODE_UPDATE_PARENT;
8921 }
8922
8923 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8924 }
8925 out1:
8926 /*
8927 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
8928 * skipped earlier as no actual rename was performed.
8929 */
8930 if (vn_authorize_skipped && error == 0) {
8931 error = vn_authorize_renamex_with_paths(fdvp, fvp,
8932 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8933 flags, NULL);
8934 if (error && error == ENOENT) {
8935 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8936 do_retry = 1;
8937 retry_count += 1;
8938 }
8939 }
8940 }
8941 if (to_name != NULL) {
8942 RELEASE_PATH(to_name);
8943 to_name = NULL;
8944 }
8945 if (to_name_no_firmlink != NULL) {
8946 RELEASE_PATH(to_name_no_firmlink);
8947 to_name_no_firmlink = NULL;
8948 }
8949 if (from_name != NULL) {
8950 RELEASE_PATH(from_name);
8951 from_name = NULL;
8952 }
8953 if (from_name_no_firmlink != NULL) {
8954 RELEASE_PATH(from_name_no_firmlink);
8955 from_name_no_firmlink = NULL;
8956 }
8957 if (holding_mntlock) {
8958 mount_unlock_renames(locked_mp);
8959 mount_drop(locked_mp, 0);
8960 holding_mntlock = 0;
8961 }
8962 if (tdvp) {
8963 /*
8964 * nameidone has to happen before we vnode_put(tdvp)
8965 * since it may need to release the fs_nodelock on the tdvp
8966 */
8967 nameidone(tond);
8968
8969 if (tvp) {
8970 vnode_put(tvp);
8971 }
8972 vnode_put(tdvp);
8973 }
8974 if (fdvp) {
8975 /*
8976 * nameidone has to happen before we vnode_put(fdvp)
8977 * since it may need to release the fs_nodelock on the fdvp
8978 */
8979 nameidone(fromnd);
8980
8981 if (fvp) {
8982 vnode_put(fvp);
8983 }
8984 vnode_put(fdvp);
8985 }
8986 if (mnt_fvp != NULLVP) {
8987 vnode_put(mnt_fvp);
8988 }
8989 /*
8990 * If things changed after we did the namei, then we will re-drive
8991 * this rename call from the top.
8992 */
8993 if (do_retry) {
8994 do_retry = 0;
8995 goto retry;
8996 }
8997
8998 kfree_type(typeof(*__rename_data), __rename_data);
8999 return error;
9000 }
9001
9002 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9003 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9004 {
9005 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9006 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9007 }
9008
9009 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9010 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9011 {
9012 return renameat_internal(
9013 vfs_context_current(),
9014 uap->fromfd, uap->from,
9015 uap->tofd, uap->to,
9016 UIO_USERSPACE, uap->flags);
9017 }
9018
9019 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9020 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9021 {
9022 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9023 uap->tofd, uap->to, UIO_USERSPACE, 0);
9024 }
9025
9026 /*
9027 * Make a directory file.
9028 *
9029 * Returns: 0 Success
9030 * EEXIST
9031 * namei:???
9032 * vnode_authorize:???
9033 * vn_create:???
9034 */
9035 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Look up the parent directory; LOCKPARENT gives us dvp with an
	 * iocount, and WILLBEDIR tells the lookup that the final component
	 * is intended to be a directory.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Ask for the compound (lookup+mkdir in one VNOP) protocol if available. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* If the lookup found an existing target, mkdir must fail. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop namei/dvp state before re-driving the lookup below. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target really doesn't exist; keep the original EACCES/EPERM. */
				goto out;
			} else {
				/* Target exists after all; report EEXIST instead. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued from where it stopped. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9147
9148 /*
9149 * mkdir_extended: Create a directory; with extended security (ACL).
9150 *
9151 * Parameters: p Process requesting to create the directory
9152 * uap User argument descriptor (see below)
9153 * retval (ignored)
9154 *
9155 * Indirect: uap->path Path of directory to create
9156 * uap->mode Access permissions to set
9157 * uap->xsecurity ACL to set
9158 *
9159 * Returns: 0 Success
9160 * !0 Not success
9161 *
9162 */
9163 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9164 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9165 {
9166 int ciferror;
9167 kauth_filesec_t xsecdst;
9168 struct vnode_attr va;
9169
9170 AUDIT_ARG(owner, uap->uid, uap->gid);
9171
9172 xsecdst = NULL;
9173 if ((uap->xsecurity != USER_ADDR_NULL) &&
9174 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9175 return ciferror;
9176 }
9177
9178 VATTR_INIT(&va);
9179 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9180 if (xsecdst != NULL) {
9181 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9182 va.va_vaflags |= VA_FILESEC_ACL;
9183 }
9184
9185 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9186 UIO_USERSPACE);
9187 if (xsecdst != NULL) {
9188 kauth_filesec_free(xsecdst);
9189 }
9190 return ciferror;
9191 }
9192
9193 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9194 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9195 {
9196 struct vnode_attr va;
9197
9198 VATTR_INIT(&va);
9199 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9200
9201 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9202 UIO_USERSPACE);
9203 }
9204
9205 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9206 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9207 {
9208 struct vnode_attr va;
9209
9210 VATTR_INIT(&va);
9211 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9212
9213 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9214 UIO_USERSPACE);
9215 }
9216
/*
 * Common implementation for rmdir()/rmdirat(): remove the directory named
 * by 'dirpath' (resolved relative to 'fd').  Handles compound-VNOP rmdir,
 * fsevent/kauth-listener notification, dataless-directory removal, and the
 * AppleDouble orphan-cleanup retry loop.
 *
 * Returns 0 on success or an errno value.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	/* Heap-allocate the large nameidata (and FSE vattr) to keep stack use down. */
	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		/* Prefer the compound (lookup+rmdir in one VNOP) protocol. */
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel itself may remove a swap-backing directory. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup; re-drive a bounded number of times. */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp from lookup: only legal in the compound-rmdir case. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the FS which attrs the event needs. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build both the regular and firmlink-free paths for notification. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* Wake any racing rmdir sleeping in the tsleep() below. */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Briefly sleep before re-driving after AppleDouble cleanup. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
9490
9491 /*
9492 * Remove a directory file.
9493 */
9494 /* ARGSUSED */
9495 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)9496 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9497 {
9498 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9499 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9500 }
9501
/*
 * Get direntry length padded to 8 byte alignment.
 * struct direntry declares d_name[MAXPATHLEN], so subtract the unused
 * portion of the name buffer before rounding up.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * struct dirent declares d_name[__DARWIN_MAXNAMLEN + 1]; account for the
 * NUL terminator, subtract the unused name space, then round up.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen). */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
9513
9514 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)9515 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9516 int *numdirent, vfs_context_t ctxp)
9517 {
9518 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9519 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9520 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9521 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9522 } else {
9523 size_t bufsize;
9524 void * bufptr;
9525 uio_t auio;
9526 struct direntry *entry64;
9527 struct dirent *dep;
9528 size_t bytesread;
9529 int error;
9530
9531 /*
9532 * We're here because the underlying file system does not
9533 * support direnties or we mounted denying support so we must
9534 * fall back to dirents and convert them to direntries.
9535 *
9536 * Our kernel buffer needs to be smaller since re-packing will
9537 * expand each dirent. The worse case (when the name length
9538 * is 3 or less) corresponds to a struct direntry size of 32
9539 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9540 * (4-byte aligned). So having a buffer that is 3/8 the size
9541 * will prevent us from reading more than we can pack.
9542 *
9543 * Since this buffer is wired memory, we will limit the
9544 * buffer size to a maximum of 32K. We would really like to
9545 * use 32K in the MIN(), but we use magic number 87371 to
9546 * prevent uio_resid() * 3 / 8 from overflowing.
9547 */
9548 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9549 bufptr = kalloc_data(bufsize, Z_WAITOK);
9550 if (bufptr == NULL) {
9551 return ENOMEM;
9552 }
9553
9554 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9555 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9556 auio->uio_offset = uio->uio_offset;
9557
9558 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9559
9560 dep = (struct dirent *)bufptr;
9561 bytesread = bufsize - uio_resid(auio);
9562
9563 entry64 = kalloc_type(struct direntry, Z_WAITOK);
9564 /*
9565 * Convert all the entries and copy them out to user's buffer.
9566 */
9567 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9568 /* First check that the dirent struct up to d_name is within the buffer */
9569 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
9570 /* Check that the length of the entire dirent is within the buffer */
9571 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9572 /* Check that the actual length including the name doesn't exceed d_reclen */
9573 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9574 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9575 vp->v_mount->mnt_vfsstat.f_mntonname,
9576 vp->v_name ? vp->v_name : "<unknown>");
9577 error = EIO;
9578 break;
9579 }
9580
9581 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9582
9583 bzero(entry64, enbufsize);
9584 /* Convert a dirent to a dirent64. */
9585 entry64->d_ino = dep->d_ino;
9586 entry64->d_seekoff = 0;
9587 entry64->d_reclen = (uint16_t)enbufsize;
9588 entry64->d_namlen = dep->d_namlen;
9589 entry64->d_type = dep->d_type;
9590 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9591
9592 /* Move to next entry. */
9593 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9594
9595 /* Copy entry64 to user's buffer. */
9596 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9597 }
9598
9599 /* Update the real offset using the offset we got from VNOP_READDIR. */
9600 if (error == 0) {
9601 uio->uio_offset = auio->uio_offset;
9602 }
9603 uio_free(auio);
9604 kfree_data(bufptr, bufsize);
9605 kfree_type(struct direntry, entry64);
9606 return error;
9607 }
9608 }
9609
9610 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9611
9612 /*
9613 * Read a block of directory entries in a file system independent format.
9614 */
/*
 * Common implementation for getdirentries()/getdirentries64(): read a
 * block of directory entries from 'fd' into the user buffer 'bufp'
 * (at most 'bufsize' bytes, capped at GETDIRENTRIES_MAXBUFSIZE).
 *
 * On success, *bytesread is the number of bytes produced, *offset the
 * directory offset before the read, and *eofflag whether EOF was hit.
 * 'flags' may include VNODE_READDIR_EXTENDED to request the 64-bit
 * (struct direntry) record format.
 *
 * Returns 0 on success or an errno value.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize offset updates; if the fd's vnode changed between
	 * fp_getfvp() and taking the lock (union-mount swap below, or a
	 * racing thread), drop everything and re-resolve the fd.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request; the user buffer may be larger than we allow. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read from the upper layer of a union mount: swap the
	 * fd over to the covered (lower) directory and read from there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
9728
9729
9730 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)9731 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9732 {
9733 off_t offset;
9734 ssize_t bytesread;
9735 int error, eofflag;
9736
9737 AUDIT_ARG(fd, uap->fd);
9738 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9739 &bytesread, &offset, &eofflag, 0);
9740
9741 if (error == 0) {
9742 if (proc_is64bit(p)) {
9743 user64_long_t base = (user64_long_t)offset;
9744 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9745 } else {
9746 user32_long_t base = (user32_long_t)offset;
9747 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9748 }
9749 *retval = (int)bytesread;
9750 }
9751 return error;
9752 }
9753
9754 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)9755 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9756 {
9757 off_t offset;
9758 ssize_t bytesread;
9759 int error, eofflag;
9760 user_size_t bufsize;
9761
9762 AUDIT_ARG(fd, uap->fd);
9763
9764 /*
9765 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9766 * then the kernel carves out the last 4 bytes to return extended
9767 * information to userspace (namely whether we reached EOF with this call).
9768 */
9769 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9770 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9771 } else {
9772 bufsize = uap->bufsize;
9773 }
9774
9775 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9776 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9777
9778 if (error == 0) {
9779 *retval = bytesread;
9780 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9781
9782 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9783 getdirentries64_flags_t flags = 0;
9784 if (eofflag) {
9785 flags |= GETDIRENTRIES64_EOF;
9786 }
9787 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9788 sizeof(flags));
9789 }
9790 }
9791 return error;
9792 }
9793
9794
9795 /*
9796 * Set the mode mask for creation of filesystem nodes.
9797 * XXX implement xsecurity
9798 */
9799 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9800 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)9801 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9802 {
9803 AUDIT_ARG(mask, newmask);
9804 proc_fdlock(p);
9805 *retval = p->p_fd.fd_cmask;
9806 p->p_fd.fd_cmask = newmask & ALLPERMS;
9807 proc_fdunlock(p);
9808 return 0;
9809 }
9810
9811 /*
9812 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9813 *
9814 * Parameters: p Process requesting to set the umask
9815 * uap User argument descriptor (see below)
9816 * retval umask of the process (parameter p)
9817 *
9818 * Indirect: uap->newmask umask to set
9819 * uap->xsecurity ACL to set
9820 *
9821 * Returns: 0 Success
9822 * !0 Not success
9823 *
9824 */
9825 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)9826 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9827 {
9828 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
9829 }
9830
9831 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)9832 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9833 {
9834 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9835 }
9836
9837 /*
9838 * Void all references to file by ripping underlying filesystem
9839 * away from vnode.
9840 */
9841 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the target path (following symlinks). */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files may be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a filesystem mounted on it cannot be revoked. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the node or hold superuser privilege. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually has the node open/aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
9894
9895
9896 /*
9897 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
9898 * The following system calls are designed to support features
9899 * which are specific to the HFS & HFS Plus volume formats
9900 */
9901
9902
9903 /*
9904 * Obtain attribute information on objects in a directory while enumerating
9905 * the directory.
9906 */
9907 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count so it can be restored on union re-drive. */
	savecount = count;
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				/* Swap the fd over to the covered (lower) directory. */
				vnode_ref_ext(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0);
				fp_set_data(fp, vp);
				fp->fp_glob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the updated count, directory state token, and base offset out. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return error; /* return error earlier, an retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10056
10057 /*
10058 * Exchange data between two files
10059 */
10060
10061 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* FSOPT_NOFOLLOW: operate on the symlink itself rather than its target. */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path; an iocount is held on fvp on success. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path; CN_NBMOUNTLOOK avoids blocking on mounts. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller must be able to both read and write each file. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Capture paths/finfo up front only if someone (fsevents or a
	 * fileop listener) will actually consume them after the exchange.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The on-disk data was swapped, so swap the cached identity
		 * (name and parent) of the two vnodes to match, under the
		 * name cache lock.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	/* Drop the iocounts taken by the two namei() calls. */
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10212
10213 /*
10214 * Return (in MB) the amount of freespace on the given vnode's volume.
10215 */
10216 uint32_t freespace_mb(vnode_t vp);
10217
10218 uint32_t
freespace_mb(vnode_t vp)10219 freespace_mb(vnode_t vp)
10220 {
10221 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10222 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10223 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10224 }
10225
10226 #if CONFIG_SEARCHFS
10227
10228 /* ARGSUSED */
10229
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/*
	 * Start by copying in the fssearchblock parameter list.  32-bit
	 * callers supply a user32 layout that is munged into the user64
	 * form so the rest of the function deals with one shape only.
	 */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 * This also bounds mallocsize below, preventing overflow.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to do into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/*                                                                                                    */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work                                 */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The referenced name must lie entirely within searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		/* Take an iocount on the lower vnode before dropping the upper. */
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesnt deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Allright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state.  Everything was already put into he return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* Report the filesystem's status (possibly EAGAIN) only after
	 * the state/count copyouts succeeded. */
	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
10512
10513 #else /* CONFIG_SEARCHFS */
10514
/* Stub when searchfs support is compiled out. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10520
10521 #endif /* CONFIG_SEARCHFS */
10522
10523
10524 #if CONFIG_DATALESS_FILES
10525
10526 /*
10527 * === Namespace Resolver Up-call Mechanism ===
10528 *
10529 * When I/O is performed to a dataless file or directory (read, write,
10530 * lookup-in, etc.), the file system performs an upcall to the namespace
10531 * resolver (filecoordinationd) to materialize the object.
10532 *
10533 * We need multiple up-calls to be in flight at once, and we need these
10534 * up-calls to be interruptible, thus the following implementation:
10535 *
10536 * => The nspace_resolver_request represents the in-kernel request state.
10537 * It contains a request ID, storage space for the errno code returned
10538 * by filecoordinationd, and flags.
10539 *
10540 * => The request ID is simply a global monotonically incrementing 32-bit
10541 * number. Outstanding requests are stored in a hash table, and the
10542 * hash function is extremely simple.
10543 *
10544 * => When an upcall is to be made to filecoordinationd, a request structure
10545 * is allocated on the stack (it is small, and needs to live only during
10546 * the duration of the call to resolve_nspace_item_ext()). It is
10547 * initialized and inserted into the table. Some backpressure from
10548 * filecoordinationd is applied by limiting the numnber of entries that
10549 * can be inserted into the table (and thus limiting the number of
10550 * outstanding requests issued to filecoordinationd); waiting for an
10551 * available slot is interruptible.
10552 *
10553 * => Once the request has been inserted into the table, the up-call is made
10554 * to filecoordinationd via a MiG-generated stub. The up-call returns
10555 * immediately and filecoordinationd processes the request asynchronously.
10556 *
10557 * => The caller now waits for the request to complete. Tnis is achieved by
10558 * sleeping on the address of the request structure and waiting for
10559 * filecoordinationd to mark the request structure as complete. This
10560 * is an interruptible sleep call; if interrupted, the request structure
10561 * is removed from the table and EINTR is returned to the caller. If
10562 * this occurs, an advisory up-call is made to filecoordinationd with
10563 * the request ID to indicate that the request can be aborted or
10564 * de-prioritized at the discretion of filecoordinationd.
10565 *
10566 * => When filecoordinationd has completed the request, it signals completion
10567 * by writing to the vfs.nspace.complete sysctl node. Only a process
10568 * decorated as a namespace resolver can write to this sysctl node. The
10569 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10570 * The request ID is looked up in the table, and if the request is found,
10571 * the error code is stored in the request structure and a wakeup()
10572 * issued on the address of the request structure. If the request is not
10573 * found, we simply drop the completion notification, assuming that the
10574 * caller was interrupted.
10575 *
10576 * => When the waiting thread wakes up, it extracts the error code from the
10577 * request structure, removes the request from the table, and returns the
10578 * error code to the calling function. Fini!
10579 */
10580
/*
 * In-kernel state for one outstanding materialization request to the
 * namespace resolver (filecoordinationd).  Allocated on the requesting
 * thread's stack and linked into the request hash table for the
 * duration of the up-call (see the block comment above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;  /* hash bucket linkage */
	vnode_t r_vp;                /* vnode being materialized; ref held by requester */
	uint32_t r_req_id;           /* request id, hash key */
	int r_resolver_error;        /* errno reported by the resolver on completion */
	int r_flags;                 /* RRF_* flags (RRF_COMPLETE) */
};
10588
10589 #define RRF_COMPLETE 0x0001
10590
/*
 * Return the next namespace-resolver request id.  Ids come from a
 * global monotonically incrementing 32-bit counter (wraps at 2^32;
 * OSAddAtomic returns the value prior to the increment).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
10598
10599 #define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
10600 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */
10601
10602 static LIST_HEAD(nspace_resolver_requesthead,
10603 nspace_resolver_request) * nspace_resolver_request_hashtbl;
10604 static u_long nspace_resolver_request_hashmask;
10605 static u_int nspace_resolver_request_count;
10606 static bool nspace_resolver_request_wait_slot;
10607 static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
10608 static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
10609 &nspace_resolver_request_lck_grp);
10610
10611 #define NSPACE_REQ_LOCK() \
10612 lck_mtx_lock(&nspace_resolver_request_hash_mutex)
10613 #define NSPACE_REQ_UNLOCK() \
10614 lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
10615
10616 #define NSPACE_RESOLVER_HASH(req_id) \
10617 (&nspace_resolver_request_hashtbl[(req_id) & \
10618 nspace_resolver_request_hashmask])
10619
10620 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)10621 nspace_resolver_req_lookup(uint32_t req_id)
10622 {
10623 struct nspace_resolver_requesthead *bucket;
10624 struct nspace_resolver_request *req;
10625
10626 bucket = NSPACE_RESOLVER_HASH(req_id);
10627 LIST_FOREACH(req, bucket, r_hashlink) {
10628 if (req->r_req_id == req_id) {
10629 return req;
10630 }
10631 }
10632
10633 return NULL;
10634 }
10635
/*
 * Insert a resolver request into the hash table, applying backpressure:
 * if NSPACE_RESOLVER_MAX_OUTSTANDING requests are already in flight,
 * sleep (interruptibly) until a slot frees up.
 *
 * Called with the request hash mutex held (msleep below is passed that
 * mutex, which it drops while sleeping and re-acquires before return).
 * Returns 0 on success or the msleep error (e.g. EINTR) if interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Tell req_remove() someone is waiting for a slot. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
10662
/*
 * Remove a resolver request from the hash table and, if another thread
 * is blocked in nspace_resolver_req_add() waiting for a free slot,
 * wake it.  Called with the request hash mutex held.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}
}
10680
/*
 * Send an advisory cancellation for the given request id to
 * filecoordinationd.  Best-effort: all failures are ignored (the
 * resolver may simply complete the request anyway).
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	/* Drop the send right obtained above. */
	ipc_port_release_send(mp);
}
10703
/*
 * Wait (interruptibly) for filecoordinationd to complete the given
 * request.  On interruption (anything other than ERESTART), record
 * EINTR/ETIMEDOUT as the result and send an advisory cancel to the
 * resolver.  The request is always removed from the hash table before
 * returning.  Returns the resolver's errno (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	/* Sleep until sysctl_nspace_complete() marks the request done. */
	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Cancel outside the lock; it performs a Mach up-call. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10733
/*
 * Record the resolver's result on the request, mark it complete, and
 * wake the thread sleeping in nspace_resolver_req_wait().  Called with
 * the request hash mutex held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10743
/*
 * Handle a completion notification from filecoordinationd for the
 * request with the given id.  If the request is no longer in the table
 * (requester was interrupted), the notification is silently dropped.
 * Optionally validates that the vnode's recursive generation count has
 * not changed since the request was issued; if it has, the requester
 * gets EBUSY instead of success.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/*
		 * Hold the mount's rename lock so the vnode cannot be
		 * renamed while we compare generation counts.
		 */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				/* Attribute unavailable: 0 disables the comparison below. */
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		/* Wakes the requester sleeping in nspace_resolver_req_wait(). */
		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
10811
10812 static struct proc *nspace_resolver_proc;
10813
10814 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)10815 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10816 {
10817 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10818 p == nspace_resolver_proc) ? 1 : 0;
10819 return 0;
10820 }
10821
/*
 * Register (is_resolver != 0) or unregister the calling process as the
 * namespace resolver.  Requires root plus the PRIV_VFS_DATALESS_RESOLVER
 * privilege.  Only one resolver may be registered at a time; a second
 * registration attempt fails with EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	error = priv_check_cred(vfs_context_ucred(ctx),
	    PRIV_VFS_DATALESS_RESOLVER, 0);
	if (error) {
		return error;
	}

	if (is_resolver) {
		/* The hash mutex also protects nspace_resolver_proc. */
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
10866
10867 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)10868 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10869 {
10870 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10871 (p->p_vfs_iopolicy &
10872 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10873 *is_prevented = 1;
10874 } else {
10875 *is_prevented = 0;
10876 }
10877 return 0;
10878 }
10879
/*
 * Set or clear the per-process "materialize dataless files" I/O policy
 * bit.  A resolver process may not enable materialization for itself
 * (returns EBUSY); asking to prevent it is a harmless no-op (returns 0).
 * The policy word is updated with atomic bit operations.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
10894
10895 static int
nspace_materialization_get_thread_state(int * is_prevented)10896 nspace_materialization_get_thread_state(int *is_prevented)
10897 {
10898 uthread_t ut = current_uthread();
10899
10900 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10901 return 0;
10902 }
10903
10904 static int
nspace_materialization_set_thread_state(int is_prevented)10905 nspace_materialization_set_thread_state(int is_prevented)
10906 {
10907 uthread_t ut = current_uthread();
10908
10909 if (is_prevented) {
10910 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10911 } else {
10912 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10913 }
10914 return 0;
10915 }
10916
10917 /* the vfs.nspace branch */
10918 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10919
/*
 * Handler for vfs.nspace.resolver: reading reports whether the calling
 * process is the registered resolver; writing registers (1) or
 * unregisters (0) it via nspace_resolver_set_proc_state().
 */
static int
sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_resolver_get_proc_state(p, &old_value);
	if (error) {
		return error;
	}

	/* Standard read-modify sysctl pattern: report old, accept new. */
	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
	    &changed);
	if (error == 0 && changed) {
		error = nspace_resolver_set_proc_state(p, new_value);
	}
	return error;
}
10940
10941 /* decorate this process as the dataless file resolver */
10942 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10943 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10944 0, 0, sysctl_nspace_resolver, "I", "");
10945
/*
 * Handler for vfs.nspace.prevent_materialization: reading reports
 * whether the calling process is prevented from materializing dataless
 * files; writing updates the per-process I/O policy accordingly.
 */
static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_proc_state(p, &old_value);
	if (error) {
		return error;
	}

	/* Standard read-modify sysctl pattern: report old, accept new. */
	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
	    &changed);
	if (error == 0 && changed) {
		error = nspace_materialization_set_proc_state(p, new_value);
	}
	return error;
}
10966
/*
 * vfs.nspace.prevent_materialization: decorate this process as not
 * wanting to materialize dataless files.
 */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
10971
10972 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)10973 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10974 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10975 {
10976 int new_value, old_value, changed = 0;
10977 int error;
10978
10979 error = nspace_materialization_get_thread_state(&old_value);
10980 if (error) {
10981 return error;
10982 }
10983
10984 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10985 &changed);
10986 if (error == 0 && changed) {
10987 error = nspace_materialization_set_thread_state(new_value);
10988 }
10989 return error;
10990 }
10991
/*
 * vfs.nspace.thread_prevent_materialization: decorate this thread as not
 * wanting to materialize dataless files.
 */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
10996
/*
 * sysctl handler for vfs.nspace.complete: the decorated resolver process
 * reports a completed request here.  The opaque payload is two uint32s
 * { req_id, errno }, optionally followed by a uint64 generation count.
 * The two sysctl_io_opaque() calls below consume sequential chunks of
 * the caller's buffer, so their order is part of the wire format.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	/* Only the process decorated as the resolver may complete requests. */
	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* First opaque chunk: { req_id, errno }. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11041
/*
 * vfs.nspace.complete: the resolver reports completed requests here as an
 * opaque { req_id, errno } pair, optionally followed by a gencount.
 */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11046
11047 #endif /* CONFIG_DATALESS_FILES */
11048
/*
 * __no_dataless_unused marks parameters that are referenced only when
 * CONFIG_DATALESS_FILES is enabled; it expands to __unused otherwise so
 * that !CONFIG_DATALESS_FILES builds stay warning-free.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused /* nothing */
#else
#define __no_dataless_unused __unused
#endif
11054
11055 int
vfs_context_dataless_materialization_is_prevented(vfs_context_t const ctx __no_dataless_unused)11056 vfs_context_dataless_materialization_is_prevented(
11057 vfs_context_t const ctx __no_dataless_unused)
11058 {
11059 #if CONFIG_DATALESS_FILES
11060 proc_t const p = vfs_context_proc(ctx);
11061 thread_t const t = vfs_context_thread(ctx);
11062 uthread_t const ut = t ? get_bsdthread_info(t) : NULL;
11063
11064 /*
11065 * Kernel context ==> return EDEADLK, as we would with any random
11066 * process decorated as no-materialize.
11067 */
11068 if (ctx == vfs_context_kernel()) {
11069 return EDEADLK;
11070 }
11071
11072 /*
11073 * If the process has the dataless-manipulation entitlement,
11074 * materialization is prevented, and depending on the kind
11075 * of file system operation, things get to proceed as if the
11076 * object is not dataless.
11077 */
11078 if (vfs_context_is_dataless_manipulator(ctx)) {
11079 return EJUSTRETURN;
11080 }
11081
11082 /*
11083 * Per-thread decorations override any process-wide decorations.
11084 * (Foundation uses this, and this overrides even the dataless-
11085 * manipulation entitlement so as to make API contracts consistent.)
11086 */
11087 if (ut != NULL) {
11088 if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
11089 return EDEADLK;
11090 }
11091 if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
11092 return 0;
11093 }
11094 }
11095
11096 /*
11097 * If the process's iopolicy specifies that dataless files
11098 * can be materialized, then we let it go ahead.
11099 */
11100 if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
11101 return 0;
11102 }
11103 #endif /* CONFIG_DATALESS_FILES */
11104
11105 /*
11106 * The default behavior is to not materialize dataless files;
11107 * return to the caller that deadlock was detected.
11108 */
11109 return EDEADLK;
11110 }
11111
/*
 * nspace_resolver_init: one-time setup of the hash table that tracks
 * in-flight resolver requests.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl = hashinit(
	    NSPACE_RESOLVER_REQ_HASHSIZE, M_VNODE /* XXX */,
	    &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11121
11122 void
nspace_resolver_exited(struct proc * p __no_dataless_unused)11123 nspace_resolver_exited(struct proc *p __no_dataless_unused)
11124 {
11125 #if CONFIG_DATALESS_FILES
11126 struct nspace_resolver_requesthead *bucket;
11127 struct nspace_resolver_request *req;
11128 u_long idx;
11129
11130 NSPACE_REQ_LOCK();
11131
11132 if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11133 p == nspace_resolver_proc) {
11134 for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
11135 bucket = &nspace_resolver_request_hashtbl[idx];
11136 LIST_FOREACH(req, bucket, r_hashlink) {
11137 nspace_resolver_req_mark_complete(req,
11138 ETIMEDOUT);
11139 }
11140 }
11141 nspace_resolver_proc = NULL;
11142 }
11143
11144 NSPACE_REQ_UNLOCK();
11145 #endif /* CONFIG_DATALESS_FILES */
11146 }
11147
/*
 * resolve_nspace_item: back-compat wrapper around
 * resolve_nspace_item_ext() with no extension argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11153
/* Entitlements that mark a process as a dataless resolver / manipulator. */
#define DATALESS_RESOLVER_ENTITLEMENT \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"
11158
11159 /*
11160 * Return TRUE if the vfs context is associated with a process entitled
11161 * for dataless manipulation.
11162 *
11163 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11164 * complication around CONFIG_DATALESS_FILES.
11165 */
11166 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)11167 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
11168 {
11169 #if CONFIG_DATALESS_FILES
11170 assert(ctx->vc_thread == current_thread());
11171 return IOCurrentTaskHasEntitlement( DATALESS_MANIPULATION_ENTITLEMENT) ||
11172 IOCurrentTaskHasEntitlement(DATALESS_RESOLVER_ENTITLEMENT);
11173 #else
11174 return false;
11175 #endif /* CONFIG_DATALESS_FILES */
11176 }
11177
11178 #if CONFIG_DATALESS_FILES
11179 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11180 log_materialization_prevented(vnode_t vp, uint64_t op)
11181 {
11182 char p_name[MAXCOMLEN + 1];
11183 char *vntype;
11184 proc_selfname(&p_name[0], sizeof(p_name));
11185
11186 if (vp->v_type == VREG) {
11187 vntype = "File";
11188 } else if (vp->v_type == VDIR) {
11189 vntype = "Dir";
11190 } else if (vp->v_type == VLNK) {
11191 vntype = "SymLink";
11192 } else {
11193 vntype = "Other";
11194 }
11195
11196 #if DEVELOPMENT
11197 char *path = NULL;
11198 int len;
11199
11200 path = get_pathbuff();
11201 len = MAXPATHLEN;
11202 if (path) {
11203 vn_getpath(vp, path, &len);
11204 }
11205
11206 os_log_debug(OS_LOG_DEFAULT,
11207 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11208 p_name, proc_selfpid(),
11209 op, vntype, path ? path : "<unknown-path>");
11210 if (path) {
11211 release_pathbuff(path);
11212 }
11213 #else
11214 os_log_debug(OS_LOG_DEFAULT,
11215 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11216 p_name, proc_selfpid(),
11217 op, vntype);
11218 #endif
11219 }
11220 #endif /* CONFIG_DATALESS_FILES */
11221
11222
11223 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11224 vfs_materialize_item(
11225 struct vnode *vp __no_dataless_unused,
11226 uint64_t op __no_dataless_unused,
11227 int64_t offset __no_dataless_unused,
11228 int64_t size __no_dataless_unused,
11229 char *lookup_name __no_dataless_unused,
11230 size_t const namelen __no_dataless_unused)
11231 {
11232 #if CONFIG_DATALESS_FILES
11233 struct nspace_resolver_request req;
11234 kern_return_t kern_ret;
11235 mach_port_t mach_port;
11236 char *path = NULL;
11237 vfs_context_t context;
11238 int path_len;
11239 int error;
11240 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11241 audit_token_t atoken;
11242 #endif
11243
11244 /*
11245 * If this is a snapshot event and the vnode is on a disk image just
11246 * pretend nothing happened since any change to the disk image will
11247 * cause the disk image itself to get backed up and this avoids multi-
11248 * way deadlocks between the snapshot handler and the ever popular
11249 * diskimages-helper process. The variable nspace_allow_virtual_devs
11250 * allows this behavior to be overridden (for use by the Mobile
11251 * TimeMachine testing infrastructure which uses disk images).
11252 */
11253 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11254 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11255 return ENOTSUP;
11256 }
11257
11258 context = vfs_context_current();
11259
11260 error = vfs_context_dataless_materialization_is_prevented(context);
11261 if (error) {
11262 log_materialization_prevented(vp, op);
11263 return error;
11264 }
11265
11266 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11267 &mach_port);
11268 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11269 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11270 /*
11271 * Treat this like being unable to access the backing store
11272 * server.
11273 */
11274 return ETIMEDOUT;
11275 }
11276
11277 path = zalloc(ZV_NAMEI);
11278 path_len = MAXPATHLEN;
11279
11280 error = vn_getpath(vp, path, &path_len);
11281 if (error) {
11282 goto out_release_port;
11283 }
11284
11285 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11286 error = vfs_context_copy_audit_token(context, &atoken);
11287 if (error) {
11288 goto out_release_port;
11289 }
11290 #endif
11291
11292 req.r_req_id = next_nspace_req_id();
11293 req.r_resolver_error = 0;
11294 req.r_flags = 0;
11295 req.r_vp = vp;
11296
11297 NSPACE_REQ_LOCK();
11298 error = nspace_resolver_req_add(&req);
11299 NSPACE_REQ_UNLOCK();
11300 if (error) {
11301 goto out_release_port;
11302 }
11303
11304 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
11305 if (vp->v_type == VDIR) {
11306 char *tmpname = NULL;
11307
11308 /*
11309 * If the caller provided a lookup_name *and* a name length,
11310 * then we assume the lookup_name is not NUL-terminated.
11311 * Allocate a temporary buffer in this case to provide
11312 * a NUL-terminated path name to the IPC call.
11313 */
11314 if (lookup_name != NULL && namelen != 0) {
11315 if (namelen >= PATH_MAX) {
11316 error = EINVAL;
11317 goto out_release_port;
11318 }
11319 tmpname = zalloc(ZV_NAMEI);
11320 strlcpy(tmpname, lookup_name, namelen + 1);
11321 lookup_name = tmpname;
11322 } else if (lookup_name != NULL) {
11323 /*
11324 * If the caller provided a lookup_name with a
11325 * zero name length, then we assume it's NUL-
11326 * terminated. Verify it has a valid length.
11327 */
11328 if (strlen(lookup_name) >= PATH_MAX) {
11329 error = EINVAL;
11330 goto out_release_port;
11331 }
11332 }
11333
11334 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11335 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
11336 req.r_req_id, (uint32_t)(op & 0xffffffff),
11337 lookup_name == NULL ? "" : lookup_name, path, atoken);
11338 #else
11339 kern_ret = send_vfs_resolve_dir(mach_port, req.r_req_id,
11340 proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11341 lookup_name == NULL ? "" : lookup_name, path);
11342 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11343
11344 if (tmpname != NULL) {
11345 zfree(ZV_NAMEI, tmpname);
11346
11347 /*
11348 * Poison lookup_name rather than reference
11349 * freed memory.
11350 */
11351 lookup_name = NULL;
11352 }
11353 } else {
11354 #ifdef DATALESS_FILES_USE_AUDIT_TOKEN
11355 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
11356 req.r_req_id, (uint32_t)(op & 0xffffffff),
11357 offset, size, path, atoken);
11358 #else
11359 kern_ret = send_vfs_resolve_file(mach_port, req.r_req_id,
11360 proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
11361 offset, size, path);
11362 #endif /* DATALESS_FILES_USE_AUDIT_TOKEN */
11363 }
11364 if (kern_ret != KERN_SUCCESS) {
11365 /*
11366 * Also treat this like being unable to access the backing
11367 * store server.
11368 */
11369 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
11370 kern_ret);
11371 error = ETIMEDOUT;
11372
11373 NSPACE_REQ_LOCK();
11374 nspace_resolver_req_remove(&req);
11375 NSPACE_REQ_UNLOCK();
11376 goto out_release_port;
11377 }
11378
11379 /*
11380 * Give back the memory we allocated earlier while we wait; we
11381 * no longer need it.
11382 */
11383 zfree(ZV_NAMEI, path);
11384 path = NULL;
11385
11386 /*
11387 * Request has been submitted to the resolver. Now (interruptibly)
11388 * wait for completion. Upon requrn, the request will have been
11389 * removed from the lookup table.
11390 */
11391 error = nspace_resolver_req_wait(&req);
11392
11393 out_release_port:
11394 if (path != NULL) {
11395 zfree(ZV_NAMEI, path);
11396 }
11397 ipc_port_release_send(mach_port);
11398
11399 return error;
11400 #else
11401 return ENOTSUP;
11402 #endif /* CONFIG_DATALESS_FILES */
11403 }
11404
11405 /*
11406 * vfs_materialize_file: Materialize a regular file.
11407 *
11408 * Inputs:
11409 * vp The dataless file to be materialized.
11410 *
11411 * op What kind of operation is being performed:
11412 * -> NAMESPACE_HANDLER_READ_OP
11413 * -> NAMESPACE_HANDLER_WRITE_OP
11414 * -> NAMESPACE_HANDLER_LINK_CREATE
11415 * -> NAMESPACE_HANDLER_DELETE_OP
11416 * -> NAMESPACE_HANDLER_TRUNCATE_OP
11417 * -> NAMESPACE_HANDLER_RENAME_OP
11418 *
11419 * offset offset of I/O for READ or WRITE. Ignored for
11420 * other ops.
11421 *
11422 * size size of I/O for READ or WRITE Ignored for
11423 * other ops.
11424 *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
 * consider the range to be unknown.
11427 *
11428 * Upon successful return, the caller may proceed with the operation.
11429 * N.B. the file may still be "dataless" in this case.
11430 */
11431 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)11432 vfs_materialize_file(
11433 struct vnode *vp,
11434 uint64_t op,
11435 int64_t offset,
11436 int64_t size)
11437 {
11438 if (vp->v_type != VREG) {
11439 return EFTYPE;
11440 }
11441 return vfs_materialize_item(vp, op, offset, size, NULL, 0);
11442 }
11443
11444 /*
11445 * vfs_materialize_dir:
11446 *
11447 * Inputs:
11448 * vp The dataless directory to be materialized.
11449 *
11450 * op What kind of operation is being performed:
11451 * -> NAMESPACE_HANDLER_READ_OP
11452 * -> NAMESPACE_HANDLER_WRITE_OP
11453 * -> NAMESPACE_HANDLER_DELETE_OP
11454 * -> NAMESPACE_HANDLER_RENAME_OP
11455 * -> NAMESPACE_HANDLER_LOOKUP_OP
11456 *
11457 * lookup_name Name being looked up for a LOOKUP op. Ignored for
11458 * other ops. May or may not be NUL-terminated; see below.
11459 *
11460 * namelen If non-zero, then lookup_name is assumed to not be NUL-
11461 * terminated and namelen is the number of valid bytes in
11462 * lookup_name. If zero, then lookup_name is assumed to be
11463 * NUL-terminated.
11464 *
11465 * Upon successful return, the caller may proceed with the operation.
11466 * N.B. the directory may still be "dataless" in this case.
11467 */
11468 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)11469 vfs_materialize_dir(
11470 struct vnode *vp,
11471 uint64_t op,
11472 char *lookup_name,
11473 size_t namelen)
11474 {
11475 if (vp->v_type != VDIR) {
11476 return EFTYPE;
11477 }
11478 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
11479 return EINVAL;
11480 }
11481 return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
11482 }
11483
/*
 * resolve_nspace_item_ext: legacy namespace-resolution path.  Sends a
 * resolve_path request for the vnode to filecoordinationd and waits for
 * the decorated resolver process to complete it.
 *
 * Returns 0 when the caller may proceed, EFTYPE for unsupported vnode
 * types, ENOTSUP for snapshot events, ETIMEDOUT when the resolver
 * service is unreachable, or another errno on failure.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	// Policy check: may this context materialize at all?
	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223; /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) { // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		// Publish the request so the resolver's completion can find
		// it.  Every failure path below must both remove the request
		// and drop the vnode_ref taken above.
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11601
/*
 * nspace_snapshot_event: snapshot-event hook.  Currently a no-op that
 * always reports success; all parameters are ignored.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
11608
/*
 * Disabled legacy helper: built a "/.vol/<fsid>/<fileid>" volfs-style
 * path for a vnode.  Kept under #if 0 for reference only; not compiled.
 */
#if 0
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
11631
11632 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)11633 fsctl_bogus_command_compat(unsigned long cmd)
11634 {
11635 switch (cmd) {
11636 case IOCBASECMD(FSIOC_SYNC_VOLUME):
11637 return FSIOC_SYNC_VOLUME;
11638 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
11639 return FSIOC_ROUTEFS_SETROUTEID;
11640 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
11641 return FSIOC_SET_PACKAGE_EXTS;
11642 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
11643 return FSIOC_SET_FSTYPENAME_OVERRIDE;
11644 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
11645 return DISK_CONDITIONER_IOC_GET;
11646 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
11647 return DISK_CONDITIONER_IOC_SET;
11648 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
11649 return FSIOC_FIOSEEKHOLE;
11650 case IOCBASECMD(FSIOC_FIOSEEKDATA):
11651 return FSIOC_FIOSEEKDATA;
11652 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
11653 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
11654 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11655 return SPOTLIGHT_IOC_GET_LAST_MTIME;
11656 }
11657
11658 return cmd;
11659 }
11660
/*
 * cas_bsdflags_setattr: setattr callback handed to chflags0() that
 * performs the compare-and-swap of BSD flags via the filesystem-specific
 * FSIOC_CAS_BSDFLAGS ioctl.  'arg' is the struct fsioc_cas_bsdflags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11666
/*
 * handle_sync_volume: FSIOC_SYNC_VOLUME handler.  Syncs the volume that
 * vp resides on.  On return, *arg_vp may have been set to NULL to tell
 * the caller that vp's iocount has already been dropped here.
 * 'data' points to the user's uint32 FSCTL_SYNC_* flag word.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (for e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	if (arg & FSCTL_SYNC_FULLSYNC) {
		/*
		 * NOTE(review): at this point 'arg' holds MNT_* wait flags
		 * (plus possibly MNT_VOLUME), yet it is tested against the
		 * userland FSCTL_SYNC_FULLSYNC bit.  Verify the flag values
		 * really line up, or whether this should instead test
		 * *(uint32_t *)data -- confirm against the flag definitions.
		 */
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
11729
11730 #if ROUTEFS
11731 static int __attribute__((noinline))
handle_routes(user_addr_t udata)11732 handle_routes(user_addr_t udata)
11733 {
11734 char routepath[MAXPATHLEN];
11735 size_t len = 0;
11736 int error;
11737
11738 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11739 return error;
11740 }
11741 bzero(routepath, MAXPATHLEN);
11742 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11743 if (error) {
11744 return error;
11745 }
11746 error = routefs_kernel_mount(routepath);
11747 return error;
11748 }
11749 #endif
11750
11751 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)11752 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
11753 {
11754 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11755 struct vnode_attr va;
11756 int error;
11757
11758 VATTR_INIT(&va);
11759 VATTR_SET(&va, va_flags, cas->new_flags);
11760
11761 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11762 return error;
11763 }
11764
11765 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)11766 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
11767 {
11768 struct mount *mp = NULL;
11769 errno_t rootauth = 0;
11770
11771 mp = vp->v_mount;
11772
11773 /*
11774 * query the underlying FS and see if it reports something
11775 * sane for this vnode. If volume is authenticated via
11776 * chunklist, leave that for the caller to determine.
11777 */
11778 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
11779
11780 return rootauth;
11781 }
11782
11783 /*
11784 * Make a filesystem-specific control call:
11785 */
11786 /* ARGSUSED */
11787 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)11788 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
11789 {
11790 int error = 0;
11791 boolean_t is64bit;
11792 u_int size;
11793 #define STK_PARAMS 128
11794 char stkbuf[STK_PARAMS] = {0};
11795 caddr_t data, memp;
11796 vnode_t vp = *arg_vp;
11797
11798 if (vp->v_type == VCHR || vp->v_type == VBLK) {
11799 return ENOTTY;
11800 }
11801
11802 cmd = fsctl_bogus_command_compat(cmd);
11803
11804 size = IOCPARM_LEN(cmd);
11805 if (size > IOCPARM_MAX) {
11806 return EINVAL;
11807 }
11808
11809 is64bit = proc_is64bit(p);
11810
11811 memp = NULL;
11812
11813 if (size > sizeof(stkbuf)) {
11814 if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
11815 return ENOMEM;
11816 }
11817 data = memp;
11818 } else {
11819 data = &stkbuf[0];
11820 };
11821
11822 if (cmd & IOC_IN) {
11823 if (size) {
11824 error = copyin(udata, data, size);
11825 if (error) {
11826 if (memp) {
11827 kfree_data(memp, size);
11828 }
11829 return error;
11830 }
11831 } else {
11832 if (is64bit) {
11833 *(user_addr_t *)data = udata;
11834 } else {
11835 *(uint32_t *)data = (uint32_t)udata;
11836 }
11837 };
11838 } else if ((cmd & IOC_OUT) && size) {
11839 /*
11840 * Zero the buffer so the user always
11841 * gets back something deterministic.
11842 */
11843 bzero(data, size);
11844 } else if (cmd & IOC_VOID) {
11845 if (is64bit) {
11846 *(user_addr_t *)data = udata;
11847 } else {
11848 *(uint32_t *)data = (uint32_t)udata;
11849 }
11850 }
11851
11852 /* Check to see if it's a generic command */
11853 switch (cmd) {
11854 case FSIOC_SYNC_VOLUME:
11855 error = handle_sync_volume(vp, arg_vp, data, ctx);
11856 break;
11857
11858 case FSIOC_ROUTEFS_SETROUTEID:
11859 #if ROUTEFS
11860 error = handle_routes(udata);
11861 #endif
11862 break;
11863
11864 case FSIOC_SET_PACKAGE_EXTS: {
11865 user_addr_t ext_strings;
11866 uint32_t num_entries;
11867 uint32_t max_width;
11868
11869 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
11870 break;
11871 }
11872
11873 if ((is64bit && size != sizeof(user64_package_ext_info))
11874 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
11875 // either you're 64-bit and passed a 64-bit struct or
11876 // you're 32-bit and passed a 32-bit struct. otherwise
11877 // it's not ok.
11878 error = EINVAL;
11879 break;
11880 }
11881
11882 if (is64bit) {
11883 if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
11884 assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
11885 }
11886 ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
11887 num_entries = ((user64_package_ext_info *)data)->num_entries;
11888 max_width = ((user64_package_ext_info *)data)->max_width;
11889 } else {
11890 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
11891 num_entries = ((user32_package_ext_info *)data)->num_entries;
11892 max_width = ((user32_package_ext_info *)data)->max_width;
11893 }
11894 error = set_package_extensions_table(ext_strings, num_entries, max_width);
11895 }
11896 break;
11897
11898 case FSIOC_SET_FSTYPENAME_OVERRIDE:
11899 {
11900 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11901 break;
11902 }
11903 if (vp->v_mount) {
11904 mount_lock(vp->v_mount);
11905 if (data[0] != 0) {
11906 int i;
11907 for (i = 0; i < MFSTYPENAMELEN; i++) {
11908 if (!data[i]) {
11909 goto continue_copy;
11910 }
11911 }
11912 /*
11913 * Getting here means we have a user data string which has no
11914 * NULL termination in its first MFSTYPENAMELEN bytes.
11915 * This is bogus, let's avoid strlcpy-ing the read data and
11916 * return an error.
11917 */
11918 error = EINVAL;
11919 goto unlock;
11920 continue_copy:
11921 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
11922 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
11923 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11924 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
11925 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
11926 }
11927 } else {
11928 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11929 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
11930 }
11931 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
11932 vp->v_mount->fstypename_override[0] = '\0';
11933 }
11934 unlock:
11935 mount_unlock(vp->v_mount);
11936 }
11937 }
11938 break;
11939
11940 case DISK_CONDITIONER_IOC_GET: {
11941 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
11942 }
11943 break;
11944
11945 case DISK_CONDITIONER_IOC_SET: {
11946 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
11947 }
11948 break;
11949
11950 case FSIOC_CAS_BSDFLAGS:
11951 error = handle_flags(vp, data, ctx);
11952 break;
11953
11954 case FSIOC_FD_ONLY_OPEN_ONCE: {
11955 error = 0;
11956 if (vnode_usecount(vp) > 1) {
11957 vnode_lock_spin(vp);
11958 if (vp->v_lflag & VL_HASSTREAMS) {
11959 if (vnode_isinuse_locked(vp, 1, 1)) {
11960 error = EBUSY;
11961 }
11962 } else if (vnode_usecount(vp) > 1) {
11963 error = EBUSY;
11964 }
11965 vnode_unlock(vp);
11966 }
11967 }
11968 break;
11969
11970 case FSIOC_EVAL_ROOTAUTH:
11971 error = handle_auth(vp, cmd, data, options, ctx);
11972 break;
11973
11974 default: {
11975 /* other, known commands shouldn't be passed down here */
11976 switch (cmd) {
11977 case F_PUNCHHOLE:
11978 case F_TRIM_ACTIVE_FILE:
11979 case F_RDADVISE:
11980 case F_TRANSCODEKEY:
11981 case F_GETPROTECTIONLEVEL:
11982 case F_GETDEFAULTPROTLEVEL:
11983 case F_MAKECOMPRESSED:
11984 case F_SET_GREEDY_MODE:
11985 case F_SETSTATICCONTENT:
11986 case F_SETIOTYPE:
11987 case F_SETBACKINGSTORE:
11988 case F_GETPATH_MTMINFO:
11989 case APFSIOC_REVERT_TO_SNAPSHOT:
11990 case FSIOC_FIOSEEKHOLE:
11991 case FSIOC_FIOSEEKDATA:
11992 case HFS_GET_BOOT_INFO:
11993 case HFS_SET_BOOT_INFO:
11994 case FIOPINSWAP:
11995 case F_CHKCLEAN:
11996 case F_FULLFSYNC:
11997 case F_BARRIERFSYNC:
11998 case F_FREEZE_FS:
11999 case F_THAW_FS:
12000 case FSIOC_KERNEL_ROOTAUTH:
12001 error = EINVAL;
12002 goto outdrop;
12003 }
12004 /* Invoke the filesystem-specific code */
12005 error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12006 }
12007 } /* end switch stmt */
12008
12009 /*
12010 * if no errors, copy any data to user. Size was
12011 * already set and checked above.
12012 */
12013 if (error == 0 && (cmd & IOC_OUT) && size) {
12014 error = copyout(data, udata, size);
12015 }
12016
12017 outdrop:
12018 if (memp) {
12019 kfree_data(memp, size);
12020 }
12021
12022 return error;
12023 }
12024
12025 /* ARGSUSED */
/*
 * fsctl: path-based filesystem control operation.
 *
 * Looks up uap->path (optionally without following symlinks/firmlinks),
 * performs a MAC policy check against the containing mount, then hands
 * the request to fsctl_internal().  Returns 0 or an errno.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone else's).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	/* Firmlink control must act on the firmlink itself, bypassing the cache. */
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may drop the iocount and reset vp to NULL. */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
12077 /* ARGSUSED */
/*
 * ffsctl: fd-based filesystem control operation.
 *
 * Resolves the vnode backing uap->fd (taking an fd reference and a vnode
 * iocount), performs a MAC policy check on its mount, then hands the
 * request to fsctl_internal().  Returns 0 or an errno.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		/* could not get an iocount; release the fd reference and bail */
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/* validate vp; fsctl_internal() can drop the iocount and reset vp to NULL */
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
12119 /* end of fsctl system call */
12120
12121 #define FILESEC_ACCESS_ENTITLEMENT \
12122 "com.apple.private.vfs.filesec-access"
12123
12124 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12125 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12126 {
12127 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12128 /*
12129 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12130 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12131 */
12132 if ((!setting && vfs_context_issuser(ctx)) ||
12133 IOCurrentTaskHasEntitlement(FILESEC_ACCESS_ENTITLEMENT)) {
12134 return 0;
12135 }
12136 }
12137
12138 return EPERM;
12139 }
12140
12141 /*
12142 * Retrieve the data of an extended attribute.
12143 */
/*
 * Retrieve the data of an extended attribute (path-based).
 *
 * Copies the attribute name in from user space, enforces protected-xattr
 * policy, then calls vn_getxattr().  *retval receives the number of bytes
 * transferred (or, when no buffer was supplied, the attribute's size).
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* These option bits are kernel-internal; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* clamp the request so filesystems don't wire huge buffers */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* with auio == NULL, vn_getxattr() only reports the size in attrsize */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12226
12227 /*
12228 * Retrieve the data of an extended attribute.
12229 */
/*
 * Retrieve the data of an extended attribute (fd-based).
 *
 * Same contract as getxattr(), but the target vnode comes from uap->fd.
 * XATTR_NOFOLLOW is meaningless on an open fd, so it is rejected here.
 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	if (uap->value && uap->size > 0) {
		/* clamp the request so filesystems don't wire huge buffers */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
12284
12285 /*
12286 * Set the data of an extended attribute.
12287 */
/*
 * Set the data of an extended attribute (path-based).
 *
 * Validates the attribute name and size limits before the (relatively
 * expensive) path lookup, then calls vn_setxattr() and posts an
 * FSE_XATTR_MODIFIED event on success.  *retval is always 0; the status
 * is the function's return value.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* These option bits are kernel-internal; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	/* a non-zero size with no buffer is nonsensical */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
12350
12351 /*
12352 * Set the data of an extended attribute.
12353 */
/*
 * Set the data of an extended attribute (fd-based).
 *
 * Same contract as setxattr(), but the target vnode comes from uap->fd.
 * XATTR_NOFOLLOW is meaningless on an open fd, so it is rejected here.
 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	/* a non-zero size with no buffer is nonsensical */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
12413
12414 /*
12415 * Remove an extended attribute.
12416 * XXX Code duplication here.
12417 */
/*
 * Remove an extended attribute (path-based).
 *
 * Unlike the get/set paths, protected attributes can never be removed
 * through this interface — there is no entitlement escape hatch.
 * Posts FSE_XATTR_REMOVED on success.
 * XXX Code duplication here.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* These option bits are kernel-internal; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
12461
12462 /*
12463 * Remove an extended attribute.
12464 * XXX Code duplication here.
12465 */
/*
 * Remove an extended attribute (fd-based).
 *
 * Same contract as removexattr(), but the target vnode comes from uap->fd.
 * XATTR_NOFOLLOW is meaningless on an open fd, so it is rejected here.
 * XXX Code duplication here.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* ctx is only needed for the fsevent; vn_removexattr gets its own below */
	vfs_context_t ctx = vfs_context_current();
#endif

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
12509
12510 /*
12511 * Retrieve the list of extended attribute names.
12512 * XXX Code duplication here.
12513 */
12514 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)12515 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
12516 {
12517 vnode_t vp;
12518 struct nameidata nd;
12519 vfs_context_t ctx = vfs_context_current();
12520 uio_t auio = NULL;
12521 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12522 size_t attrsize = 0;
12523 u_int32_t nameiflags;
12524 int error;
12525 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12526
12527 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12528 return EINVAL;
12529 }
12530
12531 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12532 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
12533 if ((error = namei(&nd))) {
12534 return error;
12535 }
12536 vp = nd.ni_vp;
12537 nameidone(&nd);
12538 if (uap->namebuf != 0 && uap->bufsize > 0) {
12539 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
12540 &uio_buf[0], sizeof(uio_buf));
12541 uio_addiov(auio, uap->namebuf, uap->bufsize);
12542 }
12543
12544 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
12545
12546 vnode_put(vp);
12547 if (auio) {
12548 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12549 } else {
12550 *retval = (user_ssize_t)attrsize;
12551 }
12552 return error;
12553 }
12554
12555 /*
12556 * Retrieve the list of extended attribute names.
12557 * XXX Code duplication here.
12558 */
/*
 * Retrieve the list of extended attribute names (fd-based).
 *
 * Same contract as listxattr(), but the target vnode comes from uap->fd.
 * XXX Code duplication here.
 */
int
flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	/* Without a user buffer, vn_listxattr() just reports the needed size. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype,
		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());

	vnode_put(vp);
	file_drop(uap->fd);
	if (auio) {
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
12597
12598 int
/*
 * Resolve (volfs_id, objid) to an absolute path.
 *
 * Looks up the mount by volfs id, VGETs the object (with special handling
 * for objid 2, the traditional root inode, on volume-group filesystems),
 * and builds its path into buf.  Retries a bounded number of times if
 * build_path() loses a rename race (EAGAIN).  On success, *pathlen holds
 * the path length including the NUL terminator written by build_path().
 *
 * buf must be a kernel buffer of bufsize bytes (bufsize <= PAGE_SIZE).
 */
int
fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
{
	int error;
	struct mount *mp = NULL;
	vnode_t vp;
	int length;
	int bpflags;
	/* maximum number of times to retry build_path */
	unsigned int retries = 0x10;

	if (bufsize > PAGE_SIZE) {
		return EINVAL;
	}

	if (buf == NULL) {
		return ENOMEM;
	}

retry:
	/* returns with the mount busied on success */
	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
		/* NOTE(review): the assignment is redundant — returned directly below */
		error = ENOTSUP;  /* unexpected failure */
		return ENOTSUP;
	}

#if CONFIG_UNION_MOUNTS
unionget:
#endif /* CONFIG_UNION_MOUNTS */
	if (objid == 2) {
		struct vfs_attr vfsattr;
		int use_vfs_root = TRUE;

		/*
		 * For the canonical root fileid, prefer VFS_ROOT — except on
		 * volume-group filesystems (when given a real fsid), where
		 * fileid 2 must be resolved through VFS_VGET.
		 */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (!(options & FSOPT_ISREALFSID) &&
		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
				use_vfs_root = FALSE;
			}
		}

		if (use_vfs_root) {
			error = VFS_ROOT(mp, &vp, ctx);
		} else {
			error = VFS_VGET(mp, objid, &vp, ctx);
		}
	} else {
		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
	}

#if CONFIG_UNION_MOUNTS
	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
		/*
		 * If the fileid isn't found and we're in a union
		 * mount volume, then see if the fileid is in the
		 * mounted-on volume.
		 */
		struct mount *tmp = mp;
		mp = vnode_mount(tmp->mnt_vnodecovered);
		vfs_unbusy(tmp);
		if (vfs_busy(mp, LK_NOWAIT) == 0) {
			goto unionget;
		}
	} else {
		vfs_unbusy(mp);
	}
#else
	vfs_unbusy(mp);
#endif /* CONFIG_UNION_MOUNTS */

	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	if (options & FSOPT_NOFIRMLINKPATH) {
		bpflags |= BUILDPATH_NO_FIRMLINK;
	}
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		/* there was a race building the path, try a few more times */
		if (error == EAGAIN) {
			--retries;
			if (retries > 0) {
				goto retry;
			}

			error = ENOENT;
		}
		goto out;
	}

	AUDIT_ARG(text, buf);

	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
		unsigned long path_words[NUMPARMS];
		size_t path_len = sizeof(path_words);

		/* keep the tail of the path if it doesn't fit in the trace record */
		if ((size_t)length < path_len) {
			memcpy((char *)path_words, buf, length);
			memset((char *)path_words + length, 0, path_len - length);

			path_len = length;
		} else {
			memcpy((char *)path_words, buf + (length - path_len), path_len);
		}

		/*
		 * NOTE(review): vp's iocount was dropped above; presumably it is
		 * passed here only as an identifier for the trace — confirm.
		 */
		kdebug_vfs_lookup(path_words, (int)path_len, vp,
		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
	}

	*pathlen = length; /* may be superseded by error */

out:
	return error;
}
12729
12730 /*
12731 * Obtain the full pathname of a file system object by id.
12732 */
12733 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)12734 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
12735 uint32_t options, user_ssize_t *retval)
12736 {
12737 vfs_context_t ctx = vfs_context_current();
12738 fsid_t fsid;
12739 char *realpath;
12740 int length;
12741 int error;
12742
12743 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
12744 return EINVAL;
12745 }
12746
12747 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
12748 return error;
12749 }
12750 AUDIT_ARG(value32, fsid.val[0]);
12751 AUDIT_ARG(value64, objid);
12752 /* Restrict output buffer size for now. */
12753
12754 if (bufsize > PAGE_SIZE || bufsize <= 0) {
12755 return EINVAL;
12756 }
12757 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
12758 if (realpath == NULL) {
12759 return ENOMEM;
12760 }
12761
12762 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
12763 options, &length);
12764
12765 if (error) {
12766 goto out;
12767 }
12768
12769 error = copyout((caddr_t)realpath, buf, length);
12770
12771 *retval = (user_ssize_t)length; /* may be superseded by error */
12772 out:
12773 kfree_data(realpath, bufsize);
12774 return error;
12775 }
12776
12777 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)12778 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
12779 {
12780 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
12781 0, retval);
12782 }
12783
12784 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)12785 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
12786 {
12787 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
12788 uap->options, retval);
12789 }
12790
12791 /*
12792 * Common routine to handle various flavors of statfs data heading out
12793 * to user space.
12794 *
12795 * Returns: 0 Success
12796 * EFAULT
12797 */
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Fills a user32/user64 statfs image from mp and sfsp and copies it to
 * bufp.  partial_copy trims the trailing reserved fields from the copy.
 * If sizep is non-NULL it receives the full (untrimmed) structure size.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* report the override typename set via FSIOC_SET_FSTYPENAME_OVERRIDE */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* report the override typename set via FSIOC_SET_FSTYPENAME_OVERRIDE */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
12919
12920 /*
12921 * copy stat structure into user_stat structure.
12922 */
/*
 * copy stat structure into user_stat structure.
 *
 * Field-for-field translation of a kernel struct stat into the 64-bit
 * user ABI layout.  The destination is zeroed first so padding and any
 * unmapped fields never leak kernel stack contents to user space.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* timestamp field names differ between the POSIX and extended ABIs */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12959
/*
 * copy stat structure into the 32-bit user_stat structure.
 *
 * Same mapping as munge_user64_stat(), but time fields are explicitly
 * narrowed to the 32-bit ABI types (values past 2038 will truncate).
 * The destination is zeroed first so padding never leaks to user space.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* timestamp field names differ between the POSIX and extended ABIs */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12996
12997 /*
12998 * copy stat64 structure into user_stat64 structure.
12999 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/*
	 * Zero the whole destination first so structure padding and any
	 * field not explicitly copied below is never leaked to user space.
	 */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/*
	 * Non-strict-POSIX layout: timestamps are full timespec members,
	 * including the stat64-only birth time.
	 */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	/* Strict-POSIX layout: timestamps are split sec/nsec scalar fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13040
/*
 * copy stat64 structure into user32_stat64 structure (for 32-bit user
 * processes); 64-bit time fields are explicitly narrowed to the 32-bit
 * user types.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/*
	 * Zero the whole destination first so structure padding and any
	 * field not explicitly copied below is never leaked to user space.
	 */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/*
	 * Non-strict-POSIX layout: copy the timespec members (including the
	 * stat64-only birth time), truncating each to the 32-bit user
	 * representation.
	 */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* Strict-POSIX layout: timestamps are split sec/nsec scalar fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13081
13082 /*
13083 * Purge buffer cache for simulating cold starts
13084 */
13085 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13086 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13087 {
13088 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13089
13090 return VNODE_RETURNED;
13091 }
13092
13093 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13094 vfs_purge_callback(mount_t mp, __unused void * arg)
13095 {
13096 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13097
13098 return VFS_RETURNED;
13099 }
13100
13101 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13102 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13103 {
13104 if (!kauth_cred_issuser(kauth_cred_get())) {
13105 return EPERM;
13106 }
13107
13108 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13109
13110 return 0;
13111 }
13112
13113 /*
13114 * gets the vnode associated with the (unnamed) snapshot directory
13115 * for a Filesystem. The snapshot directory vnode is returned with
13116 * an iocount on it.
13117 */
13118 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13119 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13120 {
13121 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13122 }
13123
13124 /*
13125 * Get the snapshot vnode.
13126 *
 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
13128 * needs nameidone() on ndp.
13129 *
13130 * If the snapshot vnode exists it is returned in ndp->ni_vp.
13131 *
13132 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13133 * not needed.
13134 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Initialize outputs so the error path can tell what needs release. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* dirfd must reference the filesystem's root vnode. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the unnamed snapshot directory vnode (returned with an iocount). */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	/* Copy the snapshot name in from user space into a pathname buffer. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; stopping early (i < name_len) means one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy check only applies to create/delete operations. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	/* USEDVP: lookup is relative to the snapshot directory vnode. */
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/*
	 * On error, drop both iocounts and NULL the outputs so callers
	 * never see stale vnode pointers (see function contract above).
	 */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
13237
13238 /*
13239 * create a filesystem snapshot (for supporting filesystems)
13240 *
13241 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13242 * We get to the (unnamed) snapshot directory vnode and create the vnode
13243 * for the snapshot in it.
13244 *
13245 * Restrictions:
13246 *
13247 * a) Passed in name for snapshot cannot have slashes.
13248 * b) name can't be "." or ".."
13249 *
13250 * Since this requires superuser privileges, vnode_authorize calls are not
13251 * made.
13252 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too big for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* O_EXCL semantics: an existing snapshot of this name fails. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/*
		 * Superuser-only operation, so skip authorization and
		 * attribute inheritance (NOAUTH | NOINHERIT).
		 */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	/* Release the namei state and the iocounts from vnode_get_snapshot(). */
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13299
13300 /*
13301 * Delete a Filesystem snapshot
13302 *
13303 * get the vnode for the unnamed snapshot directory and the snapshot and
13304 * delete the snapshot.
13305 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too big for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* Look up the snapshot with DELETE intent; ni_vp is the snapshot. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove it without generating a namespace (fsevents) notification. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	/* Release the namei state and the iocounts from vnode_get_snapshot(). */
	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13334
13335 /*
13336 * Revert a filesystem to a snapshot
13337 *
13338 * Marks the filesystem to revert to the given snapshot on next mount.
13339 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a LOOKUP componentname referring to the copied-in name. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve the snapshot vnode itself for the fallback path. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		/* Issue the revert directly against the snapshot vnode. */
		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
13423
13424 /*
13425 * rename a Filesystem snapshot
13426 *
13427 * get the vnode for the unnamed snapshot directory and the snapshot and
13428 * rename the snapshot. This is a very specialised (and simple) case of
13429 * rename(2) (which has to deal with a lot more complications). It differs
13430 * slightly from rename(2) in that EEXIST is returned if the new name exists.
13431 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Resolve the source snapshot (DELETE intent, as for rename(2)). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* Copy the destination name in from user space. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; stopping early (i < name_len) means one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming creates the new name, so run the create policy check. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Resolve the destination name relative to the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Source and destination directory are both the snapshot dir. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
13526
13527 /*
13528 * Mount a Filesystem snapshot
13529 *
13530 * get the vnode for the unnamed snapshot directory and the snapshot and
13531 * mount the snapshot.
13532 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Resolve the snapshot to be mounted (returned in snapndp->ni_vp). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Bail out if the source filesystem has been force-unmounted. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root directory of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Hand the source mount and the snapshot's componentname to the
	 * filesystem via the KERNEL_MOUNT_SNAPSHOT mount path.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
13608
13609 /*
13610 * Root from a snapshot of the filesystem
13611 *
13612 * Marks the filesystem to root from the given snapshot on next boot.
13613 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a LOOKUP componentname referring to the copied-in name. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
13669
13670 /*
13671 * FS snapshot operations dispatcher
13672 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot operations require the PRIV_VFS_SNAPSHOT privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed when any of: superuser, write access to the
		 * backing device, or the snapshot entitlement is held.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOCurrentTaskHasEntitlement("com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation implementation. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
13762